From d55d81ca8241231b90f07592be1a8ab8af58a922 Mon Sep 17 00:00:00 2001 From: jezozwierzak <jezozwierzak@gmail.com> Date: Thu, 23 Aug 2012 09:53:35 +0200 Subject: [PATCH] Changed CSVWriter to CSVTable --- utils/CSVColumn.py | 166 ++++++++++++++++++++++++++++++++++++++ utils/CSVTable.py | 178 ++++++++++++++++++++++++++++++++++++++++ utils/CSVWriter.py | 192 -------------------------------------------- utils/chunk_eval.py | 95 ++++++++++++++-------- 4 files changed, 404 insertions(+), 227 deletions(-) create mode 100755 utils/CSVColumn.py create mode 100755 utils/CSVTable.py delete mode 100644 utils/CSVWriter.py diff --git a/utils/CSVColumn.py b/utils/CSVColumn.py new file mode 100755 index 0000000..ad551de --- /dev/null +++ b/utils/CSVColumn.py @@ -0,0 +1,166 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +''' +Created on 09-08-2012 + +@author: Adam Pawlaczek +''' +# Copyright (C) 2012 Adam Pawlaczek. +# This program is free software; you can redistribute and/or modify it +# under the terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. +# +# See the LICENCE and COPYING files for more details + +class CSVColumn: + + def __init__(self, parent, name, separator, type): + self.parent = parent + self.name = name + self.type = type + self.width = len(name) + self.content = [] + self.separator = separator + + def addSubColumn(self, name, type = ''): + assert len(self.content) == 0 or self.hasSubColumns() + self.type = 'dict' + self.content.append(CSVColumn(self, name, self.separator, type)) + self.recountWidths() + + def insertValue(self, row, data, subColumn =''): + + if self.hasSubColumns() and type(data).__name__ == 'dict': + for subColumn in self.content: + subColumn.insertValue(data[subColumn.name], row) + elif self.hasSubColumns(): + for sub in self.content: + if sub.name == subColumn: + sub.content[row] = data + if type(data).__name__ == "float": + data_str = "%.4f"%data + else: + data_str = str(data) + if len(data_str) > sub.width: + sub.width = len(data_str) + break + elif subColumn == '': + self.content[row] = data + if type(data).__name__ == "float": + data_str = "%.4f"%data + else: + data_str = str(data) + if len(data_str) > self.width: + self.width = len(data_str) + + def increment(self, row, subColumn=''): + if subColumn != '' and self.hasSubColumns(): + for sub in self.content: + if sub.name == subColumn: + sub.content[row] += 1 + if sub.type == "float": + data_str = "%.4f"%sub.content[row] + else: + data_str = str(sub.content[row]) + if len(data_str) > sub.width: + self.width += len(data_str) - sub.width + sub.width = len(data_str) + break + else: + self.content[row] += 1 + + + def getValue(self, row, subColumn =''): + if subColumn != '' and self.hasSubColumns(): + for sub in self.content: + if sub.name == subColumn: + return sub.content[row] + else: + return self.content[row] + + def addValue(self, data): + if self.hasSubColumns(): + str_data = '' + for sub in self.content: + sub.addValue(data[sub.name]) + str_data += str(data[sub.name]) + self.separator + if len(str_data) > self.width: + self.recountWidths() + else: + assert type(data).__name__ != 'dict', 'You added dict to column with no subcolumns' + if len(self.content) == 0: + self.type = type(data).__name__ + self.content.append(data) + if len(str(data)) > self.width and self.isSubColumn(): + self.parent.recountWidths() + elif len(str(data) + self.separator) > self.width: + self.width = len(str(data)) + + def countSum(self): + if self.hasSubColumns(): + result = {} + for sub in self.content: + result[sub.name] = sub.countSum() + else: + if self.type != 'string': + result = 0 + for row in self.content: + result += row + return result + + def recountWidths(self): + subColsStr = '' + for subColumn in self.content: + subColsStr += subColumn.name + self.separator + + if len(subColsStr) > self.width: + self.width = len(subColsStr) + for subColumn in self.content: + subColumn.width = len(subColumn.name + self.separator) + elif self.parent.rows > 0: + widths = 0 + for i in range(0,len(self.content)): #Dla każdej podkolumny + for j in range(0, self.parent.rows): #Dla każdego wiersza + if i == len(self.content) - 1: + if self.width - widths > 0: + self.content[i].width = self.width - widths + else: + self.content[i].width = len(self.parent.ptr(self.content[i].content[j], 1)) + break + elif len(self.parent.ptr(self.content[i].content[j], 1)) > self.content[i].width: + self.content[i].width = len(self.parent.ptr(self.content[i].content[j], 1)) + widths += self.content[i].width + if widths > self.width: + self.width = widths + + def hasSubColumns(self): + if len(self.content) > 0 and type(self.content[0]).__name__ == "instance": + return self.content[0].__class__ == CSVColumn + else: + return False + + def isSubColumn(self): + if self.parent != '' and type(self.parent).__name__ == "instance" : + return self.parent.__class__ == CSVColumn + else: + return False + + def fillZeros(self, rows): + if self.hasSubColumns(): + for subcolumn in self.content: + subcolumn.fillZeros(rows) + else: + for i in range(0, rows): + if self.type == "float": + self.content.append(0.0) + elif self.type == "int": + self.content.append(0) + else: + self.content.append('') + + \ No newline at end of file diff --git a/utils/CSVTable.py b/utils/CSVTable.py new file mode 100755 index 0000000..5cab654 --- /dev/null +++ b/utils/CSVTable.py @@ -0,0 +1,178 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +''' +Created on 09-08-2012 + +@author: Adam Pawlaczek + +TODO: Ew. dodać float do obliczania AVG zamiast rzutowania na typ, w którym jest kolumna. + +''' +# Copyright (C) 2012 Adam Pawlaczek. +# This program is free software; you can redistribute and/or modify it +# under the terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. +# +# See the LICENCE and COPYING files for more details + +from CSVColumn import CSVColumn +from operator import itemgetter + +class CSVTable: + + def __init__(self, separator = ';'): + self.widths = [] + self.content = [] + self.rows = 0 + self.columns = 0 + self.separator = separator + + def addColumn(self, name, type = ''): + for column in self.content: + assert column.name != name, 'Column with name: '+ name+ ' already exists' + column = CSVColumn(self, name, self.separator, type) + self.content.append(column) + + if self.rows > 0 and type != 'dict': + column.fillZeros(self.rows) + self.columns += 1 + return True + + def addSubColumn(self, parentName, name, type = ''): + for column in self.content: + if column.name == parentName: + column.addSubColumn(name, type) + if len(column.content) > 1: + self.columns+=1 + if self.rows > 0: + column.content[-1].fillZeros(self.rows) + return True + return False + + def addRow(self, row={}): + for column in self.content: + if column.name != 'Nr' or 'Nr' in row.keys(): + column.addValue(row[column.name]) + else: + if self.rows > 0: + column.addValue(column.content[self.rows-1] + 1) + else: + column.addValue(1) + self.rows += 1 + + def addEmptyRow(self): + for column in self.content: + if column.name == "Nr": + if self.rows > 0: + column.addValue(column.content[self.rows-1] + 1) + else: + column.addValue(1) + else: + column.fillZeros(1) + self.rows += 1 + + def insertInColumn(self, columnName, row, data, subColumn= ''): + assert row < self.rows + for column in self.content: + if column.name == columnName: + column.insertValue(row, data, subColumn) + + def increment(self, columnName, row, subColumn=''): + for column in self.content: + if column.name == columnName: + column.increment(row, subColumn) + + def getValue(self, columnName, row, subColumn= ''): + assert row < self.rows + for column in self.content: + if column.name == columnName: + return column.getValue(row, subColumn) + + def hasSubColumns(self): + for column in self.content: + if column.hasSubColumns(): + return True + return False + + def hasNrColumn(self): + return self.hasColumn("Nr") + + def hasColumn(self, name): + for column in self.content: + if column.name == name: + return True + return False + + def countAvg(self): + results = {} + for column in self.content: + results[column.name] = column.countSum() + if column.type == 'int' or column.type == 'float': + results[column.name] = results[column.name] / self.rows + elif column.type == 'dict': + for subColumn in results[column.name].keys(): + results[column.name][subColumn] = results[column.name][subColumn] / self.rows + if self.hasNrColumn(): + results['Nr'] = 'AVG' + self.addRow(results) + + + def countSum(self): + results = {} + for column in self.content: + results[column.name] = column.countSum() + if self.hasNrColumn(): + results['Nr'] = 'SUM' + self.addRow(results) + + ''' Helping functions ''' + + def __repeat_to_length(self, string_to_expand, length): + return (string_to_expand * ((length/len(string_to_expand))+1))[:length] + + def ptr(self, data, width): + if type(data).__name__=='int': + return '{0:{width}{base}}'.format(data, base='d', width=width) + self.separator + elif type(data).__name__=='float': + return '{0:{width}{base}}'.format(data, base='.4f', width=width) + self.separator + else: + return '{0:{width}{base}}'.format(data, base='s', width=width) + self.separator + + + def __str__(self): + result = '' + #HEADERS + for column in self.content: + result += self.ptr(column.name, column.width) + if column.hasSubColumns(): + for i in range(1,len(column.content)): + result += self.separator + result += '\n' + #SUBHEADERS + for column in self.content: + if self.hasSubColumns(): + if column.hasSubColumns(): + for subColumn in column.content: + if len(column.content) > 1: + result += self.ptr(subColumn.name, subColumn.width) + else: + result += self.ptr(subColumn.name, column.width) + else: + result += self.ptr(' ', column.width) + result += '\n' + #DATA + for i in range(0, self.rows): #Rows iteration + for column in self.content: #Column iteration + if column.hasSubColumns(): + for subColumn in column.content: + result += self.ptr(subColumn.content[i], subColumn.width) + else: + result += self.ptr(column.content[i], column.width) + result += '\n' + return result + \ No newline at end of file diff --git a/utils/CSVWriter.py b/utils/CSVWriter.py deleted file mode 100644 index deb2e20..0000000 --- a/utils/CSVWriter.py +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/python -#-*- coding: utf-8 -*- -''' -Created on 03-08-2012 - -@author: jezozwierzak -''' - -class CSVWriter: - - def __init__(self, separator = ';'): - self.widths = [] - self.list = [] - self.rows = 0 - self.columns = 0 - self.separator = separator - - def addSubColumn(self, parentIndex, name): - parentColumn = self.list[parentIndex]; - parentColumn.append([name]) - - subColsStr = '' - for i in range(1,len(parentColumn)): - subColsStr += parentColumn[i][0] + self.separator - - if len(subColsStr) > self.widths[parentIndex]: - self.widths[parentIndex] = len(subColsStr) - 1 - - if len(parentColumn[1:]) > 1: - self.columns+=1 - - def addSubColumnByName(self, parentName, name): - assert self.rows == 0, 'You have to add all Column names before adding rows' - parentIndex = self.columnIndex(parentName) - self.addSubColumn(parentIndex, name) - - def addSubColumnsByName(self, parentName, names=[]): - for name in names: - self.addSubColumnByName(parentName, name) - - def addSubColumns(self, parentIndex, names=[]): - for name in names: - self.addSubColumn(parentIndex, name) - - def addColumn(self, name): - assert self.rows == 0, 'You have to add all Column names before adding rows' - self.list.append([name]) - self.widths.append(len(name)) - self.columns+=1 - - def addColumns(self, names=[]): - for name in names: - self.addColumn(name) - - def addRow(self, row=[]): - assert len(row) == len(self.list), 'Wrong number of columns in row' - - for i in range(0,len(self.list)): - column = self.list[i] - if len(column) > 1 and type(column[1]).__name__ == 'list': - #Adding data to subcolumns - assert len(row[i]) == len(column) - 1, 'Wrong number of subColumns in column ' + column[0] - - for j in range(0,len(row[i])): - column[j+1].append(row[i][j]) - - subColsStr = '' - for j in range(0,len(row[i])): - subColsStr += '{0:{base}}'.format(row[i][j], base='.4f') + self.separator + ' ' - - if len(subColsStr) > self.widths[i]: - self.widths[i] = len(subColsStr) - 1 - else: - #Adding data to column - column.append(row[i]) - - if len(str(row[i])) > self.widths[i]: - self.widths[i] = len(str(row[i])) - - self.rows+=1 - - def allWidth(self): - sum = 0 - for width in self.widths: - sum += width - return width - - def columnIndex(self, name): - for column in self.list: - if column[0] == name: - return self.list.index(column) - - def hasSubColumns(self): - for column in self.list: - if len(column) > 1 and type(column[1]).__name__ == 'list': - return True - return False - - def hasColumnSubColumns(self, index): - column = self.list[index] - return len(column) > 1 and type(column[1]).__name__ == 'list' - - def repeat_to_length(self, string_to_expand, length): - return (string_to_expand * ((length/len(string_to_expand))+1))[:length] - - def count_avg(self, ): - results = [] - if not self.hasSubColumns(): - for i in range(0,len(self.list)): #Iterowanie po kolumnach - results.append(0) - for j in range(1,1+self.rows): # Iterowanie po wierszach - results[i]+= self.list[i][j] - results[i]/=self.rows - else: - for i in range(0,len(self.list)): #Iterowanie po kolumnach - - if self.hasColumnSubColumns(i): - subResults = [] - for k in range(0,len(self.list[i][1:])): - subColumn = self.list[i][1:][k] - subResults.append(0) - for j in range(1,1+self.rows): # Iterowanie po wierszach - subResults[k]+= subColumn[j] - subResults[k]/=self.rows - results.append(subResults) - else: - results.append(0) - for j in range(1,1+self.rows): # Iterowanie po wierszach - results[i]+= self.list[i][j] - results[i]/=self.rows - results = results[1:] - results[:0] = ['AVG'] - self.addRow(results) - - def __str__(self): - result = '' - if not self.hasSubColumns(): - for j in range(0,1+self.rows): # Iterowanie po wierszach - for i in range(0, len(self.list)): #Iterowanie po kolumnach - if type(self.list[i][j]).__name__=='int': - result += '{0:{width}{base}}'.format(self.list[i][j], base='d', width=self.widths[i]) + self.separator - elif type(self.list[i][j]).__name__=='float': - result += '{0:{width}{base}}'.format(self.list[i][j], base='.4f', width=self.widths[i]) + self.separator - else: - result += '{0:{width}{base}}'.format(self.list[i][j], base='s', width=self.widths[i]) + self.separator - result += '\n' - else: - #Printing Thead - thead = zip(*self.list)[0] - for i in range(0,len(thead)): - if self.hasColumnSubColumns(i): - numberOfColumns = len(self.list[i][1:]) - result += '{0:{width}{base}}'.format(thead[i], base='s', width=self.widths[i]-numberOfColumns+1) + self.separator - for j in range(1,numberOfColumns): - result += self.separator - else: - result += '{0:{width}{base}}'.format(thead[i], base='s', width=self.widths[i]) + self.separator - result += '\n' - #Printing subTheads: - for i in range(0,len(self.list)): - if self.hasColumnSubColumns(i): - numberOfColumns = len(self.list[i][1:]) - for subColumn in self.list[i][1:]: - result += '{0:{width}{base}}'.format(subColumn[0], base='s', width=(self.widths[i]/numberOfColumns)) + self.separator - else: - result += '{0:{width}{base}}'.format('', base='s', width=self.widths[i]) + self.separator - result += '\n' - #Printing Data - - for j in range(1,1+self.rows): # Iterowanie po wierszach - for i in range(0, len(self.list)): #Iterowanie po kolumnach - if self.hasColumnSubColumns(i): - - for subcolumns in self.list[i][1:]: - if type(subcolumns[j]).__name__=='int': - result += '{0:{width}{base}}'.format(subcolumns[j], base='d', width=(self.widths[i]/numberOfColumns)) + self.separator - elif type(subcolumns[j]).__name__=='float': - result += '{0:{width}{base}}'.format(subcolumns[j], base='.4f', width=(self.widths[i]/numberOfColumns)) + self.separator - else: - result += '{0:{width}{base}}'.format(subcolumns[j], base='s', width=(self.widths[i]/numberOfColumns)) + self.separator - else: - - if type(self.list[i][j]).__name__=='int': - result += '{0:{width}{base}}'.format(self.list[i][j], base='d', width=self.widths[i]) + self.separator - elif type(self.list[i][j]).__name__=='float': - result += '{0:{width}{base}}'.format(self.list[i][j], base='.4f', width=self.widths[i]) + self.separator - else: - result += '{0:{width}{base}}'.format(self.list[i][j], base='s', width=self.widths[i]) + self.separator - result += '\n' - return result - - diff --git a/utils/chunk_eval.py b/utils/chunk_eval.py index 665d517..c0ac9e9 100755 --- a/utils/chunk_eval.py +++ b/utils/chunk_eval.py @@ -2,6 +2,8 @@ #-*- coding: utf-8 -*- ''' Created on 01-08-2012 + +@author: Adam Pawlaczek ''' # Copyright (C) 2012 Adam Pawlaczek. # This program is free software; you can redistribute and/or modify it @@ -15,6 +17,12 @@ Created on 01-08-2012 # # See the LICENCE and COPYING files for more details +from optparse import OptionParser +import corpus2 +import sys, os +from CSVTable import CSVTable +import codecs + descr = """%prog [options] CHUNKED REF Reads the two chunk-annotated corpora: CHUNKED (chunker output) and REF @@ -26,10 +34,6 @@ for the following settings: NOTE: this script treats discontinuous chunks as whole annotations. """ -from optparse import OptionParser -import corpus2 -import sys, os -from CSVWriter import CSVWriter class Stats: def __init__(self): @@ -58,13 +62,17 @@ class Stats: self.head_hits += len(ch.intersection(ref)) def getPRF(self, hits): + result = {} p = 0.0 if self.ch_chunks == 0 else 100.0 * hits / self.ch_chunks r = 0.0 if self.ref_chunks == 0 else 100.0 * hits / self.ref_chunks f = 0.0 if p + r == 0.0 else 2.0 * p * r / (p + r) - return [p, r, f] + result['P'] = p + result['R'] = r + result['F'] = f + return result def getStats(self): - return [self.getPRF(self.chunk_hits)] + return self.getPRF(self.chunk_hits) def get_annots(sent, chan_name): # wrap the sentence as an AnnotatedSentence @@ -106,12 +114,18 @@ def go(): ch_path, ref_path = args main(ch_path, ref_path, options.chunk_names, options.input_format, options.out_path, options.tagset, options.verbose, options.folds) -def main(ch_path, ref_path, chan_name, input_format, out_path, tagset, verbose, folds): +def main(ch_path, ref_path, chan_names, input_format, out_path, tagset, verbose, folds): + + chan_names = chan_names.split(",") - csvWriter = CSVWriter(",") - - csvWriter.addColumns(["Nr ","Chunk"]) - csvWriter.addSubColumnsByName("Chunk", ["P", "R", "F"]) + csvTable = CSVTable(";") + csvTable.addColumn('Nr') + + for chan_name in chan_names: + csvTable.addColumn(chan_name) + csvTable.addSubColumn(chan_name, "P", type="float") + csvTable.addSubColumn(chan_name, "R", type="float") + csvTable.addSubColumn(chan_name, "F", type="float") tagset = corpus2.get_named_tagset(tagset) @@ -122,35 +136,46 @@ def main(ch_path, ref_path, chan_name, input_format, out_path, tagset, verbose, else: ch_path_fold = ch_path ref_path_fold = ref_path - - ch_rdr = corpus2.TokenReader.create_path_reader( + + results = {} + + for chan_name in chan_names: + + ch_rdr = corpus2.TokenReader.create_path_reader( input_format, tagset, ch_path_fold) - ref_rdr = corpus2.TokenReader.create_path_reader( + ref_rdr = corpus2.TokenReader.create_path_reader( input_format, tagset, ref_path_fold) - - stats = Stats() - - while True: - # iterate over paragraphs (note that they are called "chunks" here) - ref_chunk = ref_rdr.get_next_chunk() - ch_chunk = ch_rdr.get_next_chunk() - assert (not ref_chunk) == (not ch_chunk), 'corpora of different length' - if not ref_chunk: - break # end of input + stats = Stats() - # process each sentence separately - for ch_sent, ref_sent in zip(ch_chunk.sentences(), ref_chunk.sentences()): - assert ch_sent.size() == ref_sent.size() - ch_annots = get_annots(ch_sent, chan_name) - ref_annots = get_annots(ref_sent, chan_name) - stats.update(ch_annots, ref_annots) + while True: + # iterate over paragraphs (note that they are called "chunks" here) + ref_chunk = ref_rdr.get_next_chunk() + ch_chunk = ch_rdr.get_next_chunk() + assert (not ref_chunk) == (not ch_chunk), 'corpora of different length' + + if not ref_chunk: + break # end of input + + # process each sentence separately + for ch_sent, ref_sent in zip(ch_chunk.sentences(), ref_chunk.sentences()): + assert ch_sent.size() == ref_sent.size() + ch_annots = get_annots(ch_sent, chan_name) + ref_annots = get_annots(ref_sent, chan_name) + stats.update(ch_annots, ref_annots) - results = stats.getStats() - results[:0] = [fold] - csvWriter.addRow(results) - csvWriter.count_avg() - print csvWriter + results[chan_name] = stats.getStats() + + csvTable.addRow(results) + if folds > 1: + csvTable.countAvg() + + if out_path != '': + out = codecs.open(out_path, "w", "utf-8") + out.close() + else: + print csvTable + if __name__ == '__main__': go() \ No newline at end of file -- GitLab