Skip to content
Snippets Groups Projects
Commit d55d81ca authored by jezozwierzak's avatar jezozwierzak
Browse files

Changed CSVWriter to CSVTable

parent 493c914d
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/python
#-*- coding: utf-8 -*-
'''
Created on 09-08-2012
@author: Adam Pawlaczek
'''
# Copyright (C) 2012 Adam Pawlaczek.
# This program is free software; you can redistribute and/or modify it
# under the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the LICENCE and COPYING files for more details
class CSVColumn(object):
    '''
    A single column of a CSV table.

    A column is either a leaf, whose ``content`` holds one cell value per
    row, or a group, whose ``content`` holds the CSVColumn objects of its
    sub-columns. ``width`` is the number of characters reserved for the
    column when it is rendered; recountWidths() keeps a group's width in step
    with the widths of its sub-columns.
    '''

    # Type names under which text columns are known: 'string' when declared
    # by the caller, 'str'/'unicode' when detected from the first value.
    _TEXT_TYPES = ('string', 'str', 'unicode')

    def __init__(self, parent, name, separator, type):
        '''
        parent    -- owner of the column (the parent CSVColumn for a
                     sub-column; any other object otherwise)
        name      -- header text, also used as the initial width
        separator -- field separator of the table
        type      -- type name of the cells ('int', 'float', 'dict', ...);
                     '' means "detect from the first value added"
        '''
        self.parent = parent
        self.name = name
        self.type = type
        self.width = len(name)
        self.content = []
        self.separator = separator

    def _cellText(self, data):
        '''Return the text used to measure a cell (floats get 4 decimals).'''
        if isinstance(data, float):
            return "%.4f" % data
        return str(data)

    def _growSub(self, sub, text):
        '''Widen sub (and this group by the same amount) so that text fits.'''
        extra = len(text) - sub.width
        if extra > 0:
            sub.width += extra
            self.width += extra

    def _storeInSub(self, sub, row, data):
        '''Overwrite one cell of a sub-column and keep the widths in step.'''
        sub.content[row] = data
        self._growSub(sub, self._cellText(data))

    def addSubColumn(self, name, type=''):
        '''
        Append a sub-column, turning this column into a group of type 'dict'.
        Only allowed while the column is empty or already is a group.
        '''
        assert len(self.content) == 0 or self.hasSubColumns()
        self.type = 'dict'
        self.content.append(CSVColumn(self, name, self.separator, type))
        self.recountWidths()

    def insertValue(self, row, data, subColumn=''):
        '''
        Overwrite the cell in the given row.

        For a group, data is either a dict keyed by sub-column name (every
        sub-column is updated) or a single value for the sub-column named
        by subColumn. For a leaf, data replaces the cell when subColumn is ''.
        '''
        if self.hasSubColumns():
            if isinstance(data, dict):
                for sub in self.content:
                    self._storeInSub(sub, row, data[sub.name])
            else:
                for sub in self.content:
                    if sub.name == subColumn:
                        self._storeInSub(sub, row, data)
                        break
        elif subColumn == '':
            self.content[row] = data
            self.width = max(self.width, len(self._cellText(data)))

    def increment(self, row, subColumn=''):
        '''Add 1 to the cell in the given row (of subColumn, when named).'''
        if subColumn != '' and self.hasSubColumns():
            for sub in self.content:
                if sub.name == subColumn:
                    sub.content[row] += 1
                    self._growSub(sub, self._cellText(sub.content[row]))
                    break
        else:
            self.content[row] += 1
            # 9 -> 10 needs one more character
            self.width = max(self.width,
                             len(self._cellText(self.content[row])))

    def getValue(self, row, subColumn=''):
        '''
        Return the cell in the given row; for a group, the cell of the
        sub-column named by subColumn, or None when no such sub-column exists.
        '''
        if subColumn != '' and self.hasSubColumns():
            for sub in self.content:
                if sub.name == subColumn:
                    return sub.content[row]
            return None
        return self.content[row]

    def addValue(self, data):
        '''
        Append one row. A group expects a dict keyed by sub-column name; a
        leaf expects a single value and adopts the type name of the first
        value it receives.
        '''
        if self.hasSubColumns():
            rowText = ''
            for sub in self.content:
                sub.addValue(data[sub.name])
                rowText += self._cellText(data[sub.name]) + self.separator
            if len(rowText) > self.width:
                self.recountWidths()
            return

        assert not isinstance(data, dict), \
            'You added dict to column with no subcolumns'
        if len(self.content) == 0:
            self.type = type(data).__name__
        self.content.append(data)
        # Measure the cell the way it will be printed; never shrink.
        textLen = len(self._cellText(data))
        if textLen > self.width:
            if self.isSubColumn():
                self.parent.recountWidths()
            else:
                self.width = textLen

    def countSum(self):
        '''
        Return the sum of the cells: a dict of per-sub-column sums for a
        group, '' for a text column, a number otherwise.
        '''
        if self.hasSubColumns():
            return dict((sub.name, sub.countSum()) for sub in self.content)
        if self.type in self._TEXT_TYPES:
            return ''
        return sum(self.content)

    def recountWidths(self):
        '''
        Recompute the width of this group and of its sub-columns.

        If the sub-column headers (each followed by the separator) do not
        fit, the group is widened to them. Otherwise every sub-column is made
        wide enough for its stored cells (cell text plus separator) and the
        last one takes whatever room is left, so that the sub-column widths
        add up to the group width.
        '''
        sep = self.separator
        headerLen = sum(len(sub.name + sep) for sub in self.content)
        if headerLen > self.width:
            self.width = headerLen
            for sub in self.content:
                sub.width = len(sub.name + sep)
            return
        if not any(sub.content for sub in self.content):
            return  # no data to measure yet

        used = 0
        last = len(self.content) - 1
        for index, sub in enumerate(self.content):
            needed = sub.width
            if sub.content:
                widest = max(len(self._cellText(cell)) for cell in sub.content)
                # an empty cell is still rendered one character wide
                cellsWidth = max(widest, 1) + len(sep)
                if index == last:
                    needed = cellsWidth
                else:
                    needed = max(needed, cellsWidth)
            if index == last:
                # the last sub-column takes the remaining room
                needed = max(needed, self.width - used)
            sub.width = needed
            used += needed
        if used > self.width:
            self.width = used

    def hasSubColumns(self):
        '''Return True when the content consists of sub-columns.'''
        return len(self.content) > 0 and isinstance(self.content[0], CSVColumn)

    def isSubColumn(self):
        '''Return True when this column belongs to another CSVColumn.'''
        return isinstance(self.parent, CSVColumn)

    def fillZeros(self, rows):
        '''
        Append the given number of blank rows: 0.0 for 'float' columns, 0 for
        'int' columns, '' for anything else. A group forwards the call to
        each of its sub-columns.
        '''
        if self.hasSubColumns():
            for sub in self.content:
                sub.fillZeros(rows)
            return
        if self.type == "float":
            blank = 0.0
        elif self.type == "int":
            blank = 0
        else:
            blank = ''
        self.content.extend([blank] * rows)
\ No newline at end of file
#!/usr/bin/python
#-*- coding: utf-8 -*-
'''
Created on 09-08-2012
@author: Adam Pawlaczek
TODO: Consider computing AVG in float instead of casting to the type of the column.
'''
# Copyright (C) 2012 Adam Pawlaczek.
# This program is free software; you can redistribute and/or modify it
# under the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the LICENCE and COPYING files for more details
from CSVColumn import CSVColumn
from operator import itemgetter
class CSVTable(object):
    '''
    A table made of CSVColumn objects that renders itself as fixed-width,
    separator-delimited text (one header line, an optional sub-header line,
    then one line per row).

    A column named 'Nr' is numbered automatically when a row does not
    provide a value for it.
    '''

    def __init__(self, separator=';'):
        self.widths = []
        self.content = []       # list of CSVColumn, in display order
        self.rows = 0           # number of data rows
        self.columns = 0        # number of leaf columns
        self.separator = separator

    def _find(self, name):
        '''Return the column with the given name, or None.'''
        for column in self.content:
            if column.name == name:
                return column
        return None

    def _nextNr(self, column):
        '''Return the next automatic row number for the 'Nr' column.'''
        if self.rows > 0:
            return column.content[self.rows - 1] + 1
        return 1

    def addColumn(self, name, type=''):
        '''
        Append a new column; column names must be unique. If the table
        already has rows, the new column is padded with blank cells unless
        it is declared as 'dict' (a group of sub-columns). Returns True.
        '''
        assert self._find(name) is None, \
            'Column with name: ' + name + ' already exists'
        column = CSVColumn(self, name, self.separator, type)
        self.content.append(column)
        if self.rows > 0 and type != 'dict':
            column.fillZeros(self.rows)
        self.columns += 1
        return True

    def addSubColumn(self, parentName, name, type=''):
        '''
        Append a sub-column to the column named parentName.
        Returns True on success, False when there is no such column.
        '''
        column = self._find(parentName)
        if column is None:
            return False
        column.addSubColumn(name, type)
        # the first sub-column takes the slot of its parent
        if len(column.content) > 1:
            self.columns += 1
        if self.rows > 0:
            column.content[-1].fillZeros(self.rows)
        return True

    def addRow(self, row=None):
        '''
        Append a row given as a dict keyed by column name (a nested dict for
        columns with sub-columns). 'Nr' may be omitted; it is then numbered
        automatically.
        '''
        if row is None:
            row = {}
        for column in self.content:
            if column.name == 'Nr' and 'Nr' not in row:
                column.addValue(self._nextNr(column))
            else:
                column.addValue(row[column.name])
        self.rows += 1

    def addEmptyRow(self):
        '''Append a row of blank cells (the 'Nr' column is still numbered).'''
        for column in self.content:
            if column.name == "Nr":
                column.addValue(self._nextNr(column))
            else:
                column.fillZeros(1)
        self.rows += 1

    def insertInColumn(self, columnName, row, data, subColumn=''):
        '''Overwrite the cell of an existing row.'''
        assert row < self.rows
        column = self._find(columnName)
        if column is not None:
            column.insertValue(row, data, subColumn)

    def increment(self, columnName, row, subColumn=''):
        '''Add 1 to the cell of an existing row.'''
        column = self._find(columnName)
        if column is not None:
            column.increment(row, subColumn)

    def getValue(self, columnName, row, subColumn=''):
        '''Return the cell of an existing row, or None for an unknown column.'''
        assert row < self.rows
        column = self._find(columnName)
        if column is None:
            return None
        return column.getValue(row, subColumn)

    def hasSubColumns(self):
        '''Return True when at least one column has sub-columns.'''
        for column in self.content:
            if column.hasSubColumns():
                return True
        return False

    def hasNrColumn(self):
        return self.hasColumn("Nr")

    def hasColumn(self, name):
        return self._find(name) is not None

    def countAvg(self):
        '''
        Append a row holding the average of every 'int', 'float' and 'dict'
        column (other columns get their plain sum); 'Nr' is labelled 'AVG'.
        Does nothing when the table has no rows, as there is nothing to
        average.
        '''
        if self.rows == 0:
            return
        results = {}
        for column in self.content:
            total = column.countSum()
            if column.type == 'int' or column.type == 'float':
                total = total / self.rows
            elif column.type == 'dict':
                total = dict((name, value / self.rows)
                             for name, value in total.items())
            results[column.name] = total
        if self.hasNrColumn():
            results['Nr'] = 'AVG'
        self.addRow(results)

    def countSum(self):
        '''Append a row holding the sum of every column; 'Nr' is 'SUM'.'''
        results = {}
        for column in self.content:
            results[column.name] = column.countSum()
        if self.hasNrColumn():
            results['Nr'] = 'SUM'
        self.addRow(results)

    ''' Helping functions '''

    def __repeat_to_length(self, string_to_expand, length):
        '''Repeat string_to_expand until it is exactly length characters.'''
        # floor division: a float repeat count is an error on Python 3
        repeats = (length // len(string_to_expand)) + 1
        return (string_to_expand * repeats)[:length]

    def ptr(self, data, width):
        '''
        Format one cell padded to width and followed by the separator:
        ints as decimals, floats with four decimals, the rest as text.
        '''
        spec = {'int': 'd', 'float': '.4f'}.get(type(data).__name__, 's')
        return '{0:{width}{base}}'.format(data, base=spec, width=width) \
            + self.separator

    def _subWidth(self, column, sub):
        '''An only sub-column is printed as wide as its parent.'''
        if len(column.content) > 1:
            return sub.width
        return column.width

    def __str__(self):
        sep = self.separator
        lines = []

        # HEADERS: a group is followed by one extra separator per
        # additional sub-column, so that the fields line up
        cells = []
        for column in self.content:
            cells.append(self.ptr(column.name, column.width))
            if column.hasSubColumns():
                cells.append(sep * (len(column.content) - 1))
        lines.append(''.join(cells))

        # SUBHEADERS: only when some column actually has sub-columns
        if self.hasSubColumns():
            cells = []
            for column in self.content:
                if column.hasSubColumns():
                    for sub in column.content:
                        cells.append(
                            self.ptr(sub.name, self._subWidth(column, sub)))
                else:
                    cells.append(self.ptr(' ', column.width))
            lines.append(''.join(cells))

        # DATA
        for index in range(self.rows):
            cells = []
            for column in self.content:
                if column.hasSubColumns():
                    for sub in column.content:
                        cells.append(self.ptr(sub.content[index],
                                              self._subWidth(column, sub)))
                else:
                    cells.append(self.ptr(column.content[index], column.width))
            lines.append(''.join(cells))

        return ''.join(line + '\n' for line in lines)
\ No newline at end of file
#!/usr/bin/python
#-*- coding: utf-8 -*-
'''
Created on 03-08-2012
@author: jezozwierzak
'''
class CSVWriter(object):
    '''
    Fixed-width, separator-delimited table stored column by column.

    ``self.list`` holds one list per column: ``[name, value, value, ...]``
    for a plain column, or ``[name, [subName, value, ...], ...]`` for a
    column split into sub-columns. ``self.widths`` holds the print width of
    every top-level column.
    '''

    def __init__(self, separator=';'):
        self.widths = []
        self.list = []
        self.rows = 0
        self.columns = 0
        self.separator = separator

    def _cellText(self, value):
        '''Return the text a value is printed as (floats get 4 decimals).'''
        if isinstance(value, float):
            return '{0:.4f}'.format(value)
        return str(value)

    def _formatCell(self, value, width):
        '''Format one cell padded to width and followed by the separator.'''
        spec = {'int': 'd', 'float': '.4f'}.get(type(value).__name__, 's')
        return '{0:{width}{base}}'.format(value, base=spec, width=width) \
            + self.separator

    def addSubColumn(self, parentIndex, name):
        '''Append a sub-column to the column at parentIndex.'''
        parentColumn = self.list[parentIndex]
        parentColumn.append([name])
        subColsStr = ''.join(sub[0] + self.separator
                             for sub in parentColumn[1:])
        if len(subColsStr) > self.widths[parentIndex]:
            self.widths[parentIndex] = len(subColsStr) - 1
        # the first sub-column takes the slot of its parent
        if len(parentColumn) > 2:
            self.columns += 1

    def addSubColumnByName(self, parentName, name):
        assert self.rows == 0, \
            'You have to add all Column names before adding rows'
        parentIndex = self.columnIndex(parentName)
        self.addSubColumn(parentIndex, name)

    def addSubColumnsByName(self, parentName, names=None):
        for name in names or ():
            self.addSubColumnByName(parentName, name)

    def addSubColumns(self, parentIndex, names=None):
        for name in names or ():
            self.addSubColumn(parentIndex, name)

    def addColumn(self, name):
        assert self.rows == 0, \
            'You have to add all Column names before adding rows'
        self.list.append([name])
        self.widths.append(len(name))
        self.columns += 1

    def addColumns(self, names=None):
        for name in names or ():
            self.addColumn(name)

    def addRow(self, row=None):
        '''
        Append a row: one item per column, where the item for a column with
        sub-columns is itself a sequence with one value per sub-column.
        '''
        if row is None:
            row = []
        assert len(row) == len(self.list), 'Wrong number of columns in row'
        for index, column in enumerate(self.list):
            value = row[index]
            if self.hasColumnSubColumns(index):
                # Adding data to subcolumns
                assert len(value) == len(column) - 1, \
                    'Wrong number of subColumns in column ' + column[0]
                for sub, item in zip(column[1:], value):
                    sub.append(item)
                subColsStr = ''.join(
                    '{0:{base}}'.format(item, base='.4f')
                    + self.separator + ' '
                    for item in value)
                if len(subColsStr) > self.widths[index]:
                    self.widths[index] = len(subColsStr) - 1
            else:
                # Adding data to column; measure it the way it is printed
                column.append(value)
                textLen = len(self._cellText(value))
                if textLen > self.widths[index]:
                    self.widths[index] = textLen
        self.rows += 1

    def allWidth(self):
        '''Return the total width of all top-level columns.'''
        return sum(self.widths)

    def columnIndex(self, name):
        '''Return the index of the column called name, or None.'''
        for index, column in enumerate(self.list):
            if column[0] == name:
                return index
        return None

    def hasSubColumns(self):
        '''Return True when at least one column has sub-columns.'''
        for index in range(len(self.list)):
            if self.hasColumnSubColumns(index):
                return True
        return False

    def hasColumnSubColumns(self, index):
        '''Return True when the column at index has sub-columns.'''
        column = self.list[index]
        return len(column) > 1 and type(column[1]).__name__ == 'list'

    def repeat_to_length(self, string_to_expand, length):
        '''Repeat string_to_expand until it is exactly length characters.'''
        # floor division: a float repeat count is an error on Python 3
        repeats = (length // len(string_to_expand)) + 1
        return (string_to_expand * repeats)[:length]

    def _average(self, values):
        '''Return the sum of values divided by the number of rows.'''
        total = 0
        for value in values:
            total += value
        return total / self.rows

    def count_avg(self, ):
        '''
        Append a row with the average of every column except the first one,
        which receives the label 'AVG'. Does nothing when there are no rows.
        '''
        if self.rows == 0:
            return
        lastRow = 1 + self.rows
        results = ['AVG']
        # the first column only carries the label, so it is not summed
        for index in range(1, len(self.list)):
            column = self.list[index]
            if self.hasColumnSubColumns(index):
                results.append([self._average(sub[1:lastRow])
                                for sub in column[1:]])
            else:
                results.append(self._average(column[1:lastRow]))
        self.addRow(results)

    def __str__(self):
        lines = []
        if not self.hasSubColumns():
            # row 0 of every column is its name
            for rowIndex in range(1 + self.rows):
                lines.append(''.join(
                    self._formatCell(column[rowIndex], width)
                    for column, width in zip(self.list, self.widths)))
            return ''.join(line + '\n' for line in lines)

        sep = self.separator
        # Header and sub-header lines
        header = []
        subHeader = []
        for index, column in enumerate(self.list):
            width = self.widths[index]
            if self.hasColumnSubColumns(index):
                subColumns = column[1:]
                count = len(subColumns)
                header.append(self._formatCell(column[0], width - count + 1))
                header.append(sep * (count - 1))
                for sub in subColumns:
                    subHeader.append(self._formatCell(sub[0], width // count))
            else:
                header.append(self._formatCell(column[0], width))
                subHeader.append(self._formatCell('', width))
        lines.append(''.join(header))
        lines.append(''.join(subHeader))

        # Data lines; the sub-column width is derived per column
        for rowIndex in range(1, 1 + self.rows):
            cells = []
            for index, column in enumerate(self.list):
                width = self.widths[index]
                if self.hasColumnSubColumns(index):
                    subColumns = column[1:]
                    subWidth = width // len(subColumns)
                    for sub in subColumns:
                        cells.append(self._formatCell(sub[rowIndex], subWidth))
                else:
                    cells.append(self._formatCell(column[rowIndex], width))
            lines.append(''.join(cells))
        return ''.join(line + '\n' for line in lines)
......@@ -2,6 +2,8 @@
#-*- coding: utf-8 -*-
'''
Created on 01-08-2012
@author: Adam Pawlaczek
'''
# Copyright (C) 2012 Adam Pawlaczek.
# This program is free software; you can redistribute and/or modify it
......@@ -15,6 +17,12 @@ Created on 01-08-2012
#
# See the LICENCE and COPYING files for more details
from optparse import OptionParser
import corpus2
import sys, os
from CSVTable import CSVTable
import codecs
descr = """%prog [options] CHUNKED REF
Reads the two chunk-annotated corpora: CHUNKED (chunker output) and REF
......@@ -26,10 +34,6 @@ for the following settings:
NOTE: this script treats discontinuous chunks as whole annotations.
"""
from optparse import OptionParser
import corpus2
import sys, os
from CSVWriter import CSVWriter
class Stats:
def __init__(self):
......@@ -58,13 +62,17 @@ class Stats:
self.head_hits += len(ch.intersection(ref))
def getPRF(self, hits):
result = {}
p = 0.0 if self.ch_chunks == 0 else 100.0 * hits / self.ch_chunks
r = 0.0 if self.ref_chunks == 0 else 100.0 * hits / self.ref_chunks
f = 0.0 if p + r == 0.0 else 2.0 * p * r / (p + r)
return [p, r, f]
result['P'] = p
result['R'] = r
result['F'] = f
return result
def getStats(self):
return [self.getPRF(self.chunk_hits)]
return self.getPRF(self.chunk_hits)
def get_annots(sent, chan_name):
# wrap the sentence as an AnnotatedSentence
......@@ -106,12 +114,18 @@ def go():
ch_path, ref_path = args
main(ch_path, ref_path, options.chunk_names, options.input_format, options.out_path, options.tagset, options.verbose, options.folds)
def main(ch_path, ref_path, chan_name, input_format, out_path, tagset, verbose, folds):
def main(ch_path, ref_path, chan_names, input_format, out_path, tagset, verbose, folds):
csvWriter = CSVWriter(",")
chan_names = chan_names.split(",")
csvWriter.addColumns(["Nr ","Chunk"])
csvWriter.addSubColumnsByName("Chunk", ["P", "R", "F"])
csvTable = CSVTable(";")
csvTable.addColumn('Nr')
for chan_name in chan_names:
csvTable.addColumn(chan_name)
csvTable.addSubColumn(chan_name, "P", type="float")
csvTable.addSubColumn(chan_name, "R", type="float")
csvTable.addSubColumn(chan_name, "F", type="float")
tagset = corpus2.get_named_tagset(tagset)
......@@ -123,6 +137,10 @@ def main(ch_path, ref_path, chan_name, input_format, out_path, tagset, verbose,
ch_path_fold = ch_path
ref_path_fold = ref_path
results = {}
for chan_name in chan_names:
ch_rdr = corpus2.TokenReader.create_path_reader(
input_format, tagset, ch_path_fold)
ref_rdr = corpus2.TokenReader.create_path_reader(
......@@ -146,11 +164,18 @@ def main(ch_path, ref_path, chan_name, input_format, out_path, tagset, verbose,
ref_annots = get_annots(ref_sent, chan_name)
stats.update(ch_annots, ref_annots)
results = stats.getStats()
results[:0] = [fold]
csvWriter.addRow(results)
csvWriter.count_avg()
print csvWriter
results[chan_name] = stats.getStats()
csvTable.addRow(results)
if folds > 1:
csvTable.countAvg()
if out_path != '':
out = codecs.open(out_path, "w", "utf-8")
out.close()
else:
print csvTable
if __name__ == '__main__':
go()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment