Changed CSVWriter to CSVTable

d55d81ca · jezozwierzak · 493c914d · d55d81ca · d55d81ca · 493c914d
Commit d55d81ca authored Aug 23, 2012 by jezozwierzak
--- a/utils/CSVColumn.py
+++ b/utils/CSVColumn.py
+#!/usr/bin/python
+#-*- coding: utf-8 -*-
+'''
+Created on 09-08-2012
+
+@author: Adam Pawlaczek
+'''
+# Copyright (C) 2012 Adam Pawlaczek.
+# This program is free software; you can redistribute and/or modify it
+# under the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the LICENCE and COPYING files for more details
+
+class CSVColumn:
+    '''
+    A single column of a CSV table, or a sub-column nested in another column.
+
+    ``content`` holds either one plain value per row or, once addSubColumn()
+    has been called, the CSVColumn objects of the sub-columns. ``width`` is
+    the text width used when the column is printed; it starts at len(name)
+    and is widened as longer values are stored.
+    '''
+
+    def __init__(self, parent, name, separator, type):
+        '''
+        parent    -- owner of this column: the table for a top-level column,
+                     the enclosing CSVColumn for a sub-column
+        name      -- header text of the column
+        separator -- field separator, counted when widths are computed
+        type      -- type name of the stored values ('int', 'float', 'dict', ...)
+        '''
+        self.parent = parent
+        self.name = name
+        self.type = type
+        self.width = len(name)
+        self.content = []
+        self.separator = separator
+
+    def addSubColumn(self, name, type = ''):
+        '''Append a sub-column; only allowed while no plain values are stored.'''
+        assert len(self.content) == 0 or self.hasSubColumns()
+        self.type = 'dict'
+        self.content.append(CSVColumn(self, name, self.separator, type))
+        self.recountWidths()
+
+    def _widthOf(self, data, asFloat):
+        '''Return the printed length of data; floats are measured with 4 decimals.'''
+        if asFloat:
+            return len("%.4f" % data)
+        return len(str(data))
+
+    def _store(self, row, data):
+        '''Overwrite the value in the given row and widen the column if needed.'''
+        self.content[row] = data
+        needed = self._widthOf(data, type(data).__name__ == "float")
+        if needed > self.width:
+            self.width = needed
+
+    def insertValue(self, row, data, subColumn =''):
+        '''
+        Overwrite the value stored in an existing row.
+
+        row       -- index of the row to overwrite
+        data      -- new value; a dict keyed by sub-column name updates every
+                     sub-column at once
+        subColumn -- name of the single sub-column to update when data is
+                     not a dict
+        '''
+        if self.hasSubColumns() and isinstance(data, dict):
+            for sub in self.content:
+                # Arguments are (row, data): the value for each sub-column
+                # is looked up in the dict by the sub-column's name.
+                sub.insertValue(row, data[sub.name])
+        elif self.hasSubColumns():
+            for sub in self.content:
+                if sub.name == subColumn:
+                    sub._store(row, data)
+                    break
+        elif subColumn == '':
+            self._store(row, data)
+
+    def increment(self, row, subColumn=''):
+        '''Add 1 to the value in the given row (of the named sub-column, if any).'''
+        if subColumn != '' and self.hasSubColumns():
+            for sub in self.content:
+                if sub.name == subColumn:
+                    sub.content[row] += 1
+                    needed = self._widthOf(sub.content[row], sub.type == "float")
+                    if needed > sub.width:
+                        # The parent grows by the same amount as the sub-column.
+                        self.width += needed - sub.width
+                        sub.width = needed
+                    break
+        else:
+            self.content[row] += 1
+            # Keep the width in step with the value, as insertValue() does.
+            needed = self._widthOf(self.content[row], self.type == "float")
+            if needed > self.width:
+                self.width = needed
+
+    def getValue(self, row, subColumn =''):
+        '''
+        Return the value stored in the given row.
+
+        With subColumn set on a column that has sub-columns, the value of that
+        sub-column is returned; None is returned when no sub-column matches.
+        '''
+        if subColumn != '' and self.hasSubColumns():
+            for sub in self.content:
+                if sub.name == subColumn:
+                    return sub.content[row]
+        else:
+            return self.content[row]
+
+    def addValue(self, data):
+        '''
+        Append a value as a new row.
+
+        For a column with sub-columns data is indexed by sub-column name.
+        The first value appended to an empty plain column sets its type.
+        '''
+        if self.hasSubColumns():
+            str_data = ''
+            for sub in self.content:
+                sub.addValue(data[sub.name])
+                str_data += str(data[sub.name]) + self.separator
+            if len(str_data) > self.width:
+                self.recountWidths()
+        else:
+            assert not isinstance(data, dict), 'You added dict to column with no subcolumns'
+            if len(self.content) == 0:
+                self.type = type(data).__name__
+            self.content.append(data)
+            if len(str(data)) > self.width and self.isSubColumn():
+                self.parent.recountWidths()
+            elif len(str(data) + self.separator) > self.width:
+                self.width = len(str(data))
+
+    def countSum(self):
+        '''
+        Return the sum of the stored values.
+
+        A column with sub-columns returns a dict mapping each sub-column name
+        to its sum. A text column returns '' because it has no numeric sum.
+        '''
+        if self.hasSubColumns():
+            result = {}
+            for sub in self.content:
+                result[sub.name] = sub.countSum()
+            return result
+        # addValue() records type(data).__name__, so text columns are typed
+        # 'str' (or 'unicode'); 'string' is kept for explicitly typed columns.
+        if self.type in ('string', 'str', 'unicode'):
+            return ''
+        return sum(self.content)
+
+    def recountWidths(self):
+        '''Recompute the widths of this column and of its sub-columns.'''
+        subColsStr = ''
+        for sub in self.content:
+            subColsStr += sub.name + self.separator
+
+        if len(subColsStr) > self.width:
+            # The sub-headers alone are wider than the column: size everything
+            # from the sub-column names.
+            self.width = len(subColsStr)
+            for sub in self.content:
+                sub.width = len(sub.name + self.separator)
+        elif self.parent.rows > 0:
+            widths = 0
+            last = len(self.content) - 1
+            for i in range(0, len(self.content)):       # for each sub-column
+                sub = self.content[i]
+                for j in range(0, self.parent.rows):    # for each row
+                    if i == last:
+                        # The last sub-column takes whatever width is left.
+                        if self.width - widths > 0:
+                            sub.width = self.width - widths
+                        else:
+                            sub.width = len(self.parent.ptr(sub.content[j], 1))
+                        break
+                    printed = len(self.parent.ptr(sub.content[j], 1))
+                    if printed > sub.width:
+                        sub.width = printed
+                widths += sub.width
+            if widths > self.width:
+                self.width = widths
+
+    def hasSubColumns(self):
+        '''Return True when content holds sub-columns instead of plain values.'''
+        return len(self.content) > 0 and isinstance(self.content[0], CSVColumn)
+
+    def isSubColumn(self):
+        '''Return True when this column is nested inside another CSVColumn.'''
+        return isinstance(self.parent, CSVColumn)
+
+    def fillZeros(self, rows):
+        '''Append the given number of neutral values (0.0, 0 or '') per column type.'''
+        if self.hasSubColumns():
+            for subcolumn in self.content:
+                subcolumn.fillZeros(rows)
+        else:
+            for i in range(0, rows):
+                if self.type == "float":
+                    self.content.append(0.0)
+                elif self.type == "int":
+                    self.content.append(0)
+                else:
+                    self.content.append('')
+    
\ No newline at end of file
--- a/utils/CSVTable.py
+++ b/utils/CSVTable.py
+#!/usr/bin/python
+#-*- coding: utf-8 -*-
+'''
+Created on 09-08-2012
+
+@author: Adam Pawlaczek
+
+TODO: Consider computing AVG as a float instead of casting it to the type of the column.
+
+'''
+# Copyright (C) 2012 Adam Pawlaczek.
+# This program is free software; you can redistribute and/or modify it
+# under the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the LICENCE and COPYING files for more details
+
+from CSVColumn import CSVColumn
+from operator import itemgetter
+
+class CSVTable:
+    '''
+    A table of named columns that is rendered as aligned, separator-delimited
+    text by str().
+
+    Columns are CSVColumn objects kept in ``content``; ``rows`` counts the rows
+    added so far. A column named "Nr" is numbered automatically when a row
+    does not supply a value for it.
+    '''
+
+    def __init__(self, separator = ';'):
+        '''separator -- text appended after every printed field'''
+        self.widths = []
+        self.content  = []
+        self.rows = 0
+        self.columns = 0
+        self.separator = separator
+
+    def _findColumn(self, name):
+        '''Return the column with the given name, or None when there is none.'''
+        for column in self.content:
+            if column.name == name:
+                return column
+        return None
+
+    def _nextNr(self, column):
+        '''Return the next automatic row number for the "Nr" column.'''
+        if self.rows > 0:
+            return column.content[self.rows-1] + 1
+        return 1
+
+    def addColumn(self, name, type = ''):
+        '''
+        Append a new column and return True.
+
+        Column names must be unique (checked with an assertion). When rows
+        already exist, a non-'dict' column is padded with neutral values.
+        '''
+        for column in self.content:
+            assert column.name != name, 'Column with name: '+ name+ ' already exists'
+        column = CSVColumn(self, name, self.separator, type)
+        self.content.append(column)
+
+        if self.rows > 0 and type != 'dict':
+            column.fillZeros(self.rows)
+        self.columns += 1
+        return True
+
+    def addSubColumn(self, parentName, name, type = ''):
+        '''
+        Add a sub-column to the column called parentName.
+
+        Returns True on success and False when no such column exists.
+        '''
+        column = self._findColumn(parentName)
+        if column is None:
+            return False
+        column.addSubColumn(name, type)
+        # The first sub-column takes the place of its parent in the printed
+        # layout; only further ones add a printed column.
+        if len(column.content) > 1:
+            self.columns+=1
+        if self.rows > 0:
+            column.content[-1].fillZeros(self.rows)
+        return True
+
+    def addRow(self, row=None):
+        '''
+        Append one row.
+
+        row -- dict keyed by column name; it must contain every column except
+               "Nr", which is numbered automatically when absent.
+        '''
+        # A fresh dict per call: a literal {} default would be shared.
+        if row is None:
+            row = {}
+        for column in self.content:
+            if column.name != 'Nr' or 'Nr' in row:
+                column.addValue(row[column.name])
+            else:
+                column.addValue(self._nextNr(column))
+        self.rows += 1
+
+    def addEmptyRow(self):
+        '''Append a row of neutral values; "Nr" still gets the next number.'''
+        for column in self.content:
+            if column.name == "Nr":
+                column.addValue(self._nextNr(column))
+            else:
+                column.fillZeros(1)
+        self.rows += 1
+
+    def insertInColumn(self, columnName, row, data, subColumn= ''):
+        '''Overwrite the value of an existing row in the named column.'''
+        assert row < self.rows
+        column = self._findColumn(columnName)
+        if column is not None:
+            column.insertValue(row, data, subColumn)
+
+    def increment(self, columnName, row, subColumn=''):
+        '''Add 1 to the value of the given row in the named column.'''
+        column = self._findColumn(columnName)
+        if column is not None:
+            column.increment(row, subColumn)
+
+    def getValue(self, columnName, row, subColumn= ''):
+        '''Return the value of the given row, or None for an unknown column.'''
+        assert row < self.rows
+        column = self._findColumn(columnName)
+        if column is not None:
+            return column.getValue(row, subColumn)
+
+    def hasSubColumns(self):
+        '''Return True when at least one column has sub-columns.'''
+        for column in self.content:
+            if column.hasSubColumns():
+                return True
+        return False
+
+    def hasNrColumn(self):
+        '''Return True when the table has a column named "Nr".'''
+        return self.hasColumn("Nr")
+
+    def hasColumn(self, name):
+        '''Return True when a column with the given name exists.'''
+        return self._findColumn(name) is not None
+
+    def countAvg(self):
+        '''
+        Append a row holding the per-column average, labelled 'AVG' in "Nr".
+
+        Sums are divided by the current number of rows, so a table with
+        numeric columns and no rows raises ZeroDivisionError.
+        '''
+        results = {}
+        for column in self.content:
+            results[column.name] = column.countSum()
+            if column.type == 'int' or column.type == 'float':
+                results[column.name] = results[column.name] / self.rows
+            elif column.type == 'dict':
+                sums = results[column.name]
+                for subColumn in list(sums.keys()):
+                    sums[subColumn] = sums[subColumn] / self.rows
+        if self.hasNrColumn():
+            results['Nr'] = 'AVG'
+        self.addRow(results)
+
+    def countSum(self):
+        '''Append a row holding the per-column sum, labelled 'SUM' in "Nr".'''
+        results = {}
+        for column in self.content:
+            results[column.name] = column.countSum()
+        if self.hasNrColumn():
+            results['Nr'] = 'SUM'
+        self.addRow(results)
+
+    # Helper functions
+
+    def __repeat_to_length(self, string_to_expand, length):
+        '''Repeat string_to_expand until it is exactly length characters long.'''
+        # Floor division: the repeat count has to be an integer.
+        return (string_to_expand * ((length//len(string_to_expand))+1))[:length]
+
+    def ptr(self, data, width):
+        '''
+        Format one field padded to width and followed by the separator.
+
+        ints use the 'd' format, floats four decimals, anything else 's'.
+        '''
+        base = {'int': 'd', 'float': '.4f'}.get(type(data).__name__, 's')
+        return '{0:{width}{base}}'.format(data, base=base, width=width) + self.separator
+
+    def __str__(self):
+        '''Render the header line, the sub-header line and one line per row.'''
+        parts = []
+        # Headers: a column with sub-columns is followed by one extra
+        # separator for every sub-column after the first.
+        for column in self.content:
+            parts.append(self.ptr(column.name, column.width))
+            if column.hasSubColumns():
+                parts.append(self.separator * (len(column.content) - 1))
+        parts.append('\n')
+        # Sub-headers: the line is left empty when no column has sub-columns.
+        if self.hasSubColumns():
+            for column in self.content:
+                if column.hasSubColumns():
+                    single = len(column.content) == 1
+                    for subColumn in column.content:
+                        # A lone sub-column spans the width of its parent.
+                        if single:
+                            parts.append(self.ptr(subColumn.name, column.width))
+                        else:
+                            parts.append(self.ptr(subColumn.name, subColumn.width))
+                else:
+                    parts.append(self.ptr(' ', column.width))
+        parts.append('\n')
+        # Data
+        for i in range(0, self.rows):
+            for column in self.content:
+                if column.hasSubColumns():
+                    for subColumn in column.content:
+                        parts.append(self.ptr(subColumn.content[i], subColumn.width))
+                else:
+                    parts.append(self.ptr(column.content[i], column.width))
+            parts.append('\n')
+        return ''.join(parts)
+    
\ No newline at end of file
--- a/utils/CSVWriter.py
+++ b/utils/CSVWriter.py
-#!/usr/bin/python
-#-*- coding: utf-8 -*-
-'''
-Created on 03-08-2012
-
-@author: jezozwierzak
-'''
-
-class CSVWriter:
-        
-    def __init__(self, separator = ';'):
-        self.widths = []
-        self.list  = []
-        self.rows = 0
-        self.columns = 0
-        self.separator = separator
-    
-    def addSubColumn(self, parentIndex, name):
-        parentColumn = self.list[parentIndex];
-        parentColumn.append([name])
-        
-        subColsStr = ''
-        for i in range(1,len(parentColumn)):
-            subColsStr += parentColumn[i][0] + self.separator
-            
-        if len(subColsStr) > self.widths[parentIndex]:
-            self.widths[parentIndex] = len(subColsStr) - 1
-        
-        if len(parentColumn[1:]) > 1:
-            self.columns+=1
-    
-    def addSubColumnByName(self, parentName, name):
-        assert self.rows == 0, 'You have to add all Column names before adding rows'
-        parentIndex = self.columnIndex(parentName)
-        self.addSubColumn(parentIndex, name)
-    
-    def addSubColumnsByName(self, parentName, names=[]):
-        for name in names:
-            self.addSubColumnByName(parentName, name)
-    
-    def addSubColumns(self, parentIndex, names=[]):
-        for name in names:
-            self.addSubColumn(parentIndex, name)
-    
-    def addColumn(self, name):
-        assert self.rows == 0, 'You have to add all Column names before adding rows'
-        self.list.append([name])
-        self.widths.append(len(name))
-        self.columns+=1
-    
-    def addColumns(self, names=[]):
-        for name in names:
-            self.addColumn(name)
-    
-    def addRow(self, row=[]):
-        assert len(row) == len(self.list), 'Wrong number of columns in row'
-        
-        for i in range(0,len(self.list)):
-            column = self.list[i]
-            if len(column) > 1 and type(column[1]).__name__ == 'list':
-                #Adding data to subcolumns
-                assert len(row[i]) == len(column) - 1, 'Wrong number of subColumns in column ' + column[0]
-                
-                for j in range(0,len(row[i])):
-                    column[j+1].append(row[i][j])
-
-                subColsStr = ''
-                for j in range(0,len(row[i])):
-                    subColsStr += '{0:{base}}'.format(row[i][j], base='.4f') + self.separator + ' '
-                
-                if len(subColsStr) > self.widths[i]:
-                    self.widths[i] = len(subColsStr) - 1
-            else:
-                #Adding data to column
-                column.append(row[i])
-                
-                if len(str(row[i])) > self.widths[i]:
-                    self.widths[i] = len(str(row[i]))
-                    
-        self.rows+=1           
-
-    def allWidth(self):
-        sum = 0
-        for width in self.widths:
-            sum += width
-        return width
-    
-    def columnIndex(self, name):
-        for column in self.list:
-            if column[0] == name:
-                return self.list.index(column)
-    
-    def hasSubColumns(self):
-        for column in self.list:
-            if len(column) > 1 and type(column[1]).__name__ == 'list':
-                return True
-        return False
-    
-    def hasColumnSubColumns(self, index):
-        column = self.list[index]
-        return len(column) > 1 and type(column[1]).__name__ == 'list'
-        
-    def repeat_to_length(self, string_to_expand, length):
-        return (string_to_expand * ((length/len(string_to_expand))+1))[:length]
-
-    def count_avg(self, ):
-        results = [] 
-        if not self.hasSubColumns():
-            for i in range(0,len(self.list)): #Iterowanie po kolumnach
-                results.append(0)
-                for j in range(1,1+self.rows): # Iterowanie po wierszach
-                    results[i]+= self.list[i][j]
-                results[i]/=self.rows
-        else:
-            for i in range(0,len(self.list)): #Iterowanie po kolumnach
-                
-                if self.hasColumnSubColumns(i):
-                    subResults = []
-                    for k in range(0,len(self.list[i][1:])):
-                        subColumn = self.list[i][1:][k]
-                        subResults.append(0)
-                        for j in range(1,1+self.rows): # Iterowanie po wierszach
-                            subResults[k]+= subColumn[j]
-                        subResults[k]/=self.rows
-                    results.append(subResults)
-                else:
-                    results.append(0)
-                    for j in range(1,1+self.rows): # Iterowanie po wierszach
-                        results[i]+= self.list[i][j]
-                    results[i]/=self.rows
-        results = results[1:]
-        results[:0] = ['AVG']
-        self.addRow(results)
-
-    def __str__(self):
-        result = ''
-        if not self.hasSubColumns():
-            for j in range(0,1+self.rows): # Iterowanie po wierszach
-                for i in range(0, len(self.list)): #Iterowanie po kolumnach
-                    if type(self.list[i][j]).__name__=='int':
-                        result += '{0:{width}{base}}'.format(self.list[i][j], base='d', width=self.widths[i]) + self.separator
-                    elif type(self.list[i][j]).__name__=='float':
-                        result += '{0:{width}{base}}'.format(self.list[i][j], base='.4f', width=self.widths[i]) + self.separator
-                    else:
-                        result += '{0:{width}{base}}'.format(self.list[i][j], base='s', width=self.widths[i]) + self.separator
-                result += '\n'
-        else:
-            #Printing Thead
-            thead = zip(*self.list)[0]
-            for i in range(0,len(thead)):
-                if self.hasColumnSubColumns(i):
-                    numberOfColumns = len(self.list[i][1:])
-                    result += '{0:{width}{base}}'.format(thead[i], base='s', width=self.widths[i]-numberOfColumns+1) + self.separator
-                    for j in range(1,numberOfColumns):
-                        result += self.separator
-                else:
-                    result += '{0:{width}{base}}'.format(thead[i], base='s', width=self.widths[i]) + self.separator
-            result += '\n'
-            #Printing subTheads:
-            for i in range(0,len(self.list)):
-                if self.hasColumnSubColumns(i):
-                    numberOfColumns = len(self.list[i][1:])
-                    for subColumn in self.list[i][1:]:
-                        result += '{0:{width}{base}}'.format(subColumn[0], base='s', width=(self.widths[i]/numberOfColumns)) + self.separator
-                else:
-                    result += '{0:{width}{base}}'.format('', base='s', width=self.widths[i]) + self.separator
-            result += '\n'
-            #Printing Data
-
-            for j in range(1,1+self.rows): # Iterowanie po wierszach
-                for i in range(0, len(self.list)): #Iterowanie po kolumnach
-                    if self.hasColumnSubColumns(i):
-                        
-                        for subcolumns in self.list[i][1:]:
-                            if type(subcolumns[j]).__name__=='int':
-                                result += '{0:{width}{base}}'.format(subcolumns[j], base='d', width=(self.widths[i]/numberOfColumns)) + self.separator
-                            elif type(subcolumns[j]).__name__=='float':
-                                result += '{0:{width}{base}}'.format(subcolumns[j], base='.4f', width=(self.widths[i]/numberOfColumns)) + self.separator
-                            else:
-                                result += '{0:{width}{base}}'.format(subcolumns[j], base='s', width=(self.widths[i]/numberOfColumns)) + self.separator
-                    else:
-                        
-                        if type(self.list[i][j]).__name__=='int':
-                            result += '{0:{width}{base}}'.format(self.list[i][j], base='d', width=self.widths[i]) + self.separator
-                        elif type(self.list[i][j]).__name__=='float':
-                            result += '{0:{width}{base}}'.format(self.list[i][j], base='.4f', width=self.widths[i]) + self.separator
-                        else:
-                            result += '{0:{width}{base}}'.format(self.list[i][j], base='s', width=self.widths[i]) + self.separator
-                result += '\n'
-        return result
-
-
--- a/utils/chunk_eval.py
+++ b/utils/chunk_eval.py
@@ -2,6 +2,8 @@
 #-*- coding: utf-8 -*-
 '''
 Created on 01-08-2012
+
+@author: Adam Pawlaczek
 '''
 # Copyright (C) 2012 Adam Pawlaczek.
 # This program is free software; you can redistribute and/or modify it
@@ -15,6 +17,12 @@ Created on 01-08-2012
 #
 # See the LICENCE and COPYING files for more details

+from optparse import OptionParser
+import corpus2
+import sys, os
+from CSVTable import CSVTable
+import codecs
+
 descr = """%prog [options] CHUNKED REF

 Reads the two chunk-annotated corpora: CHUNKED (chunker output) and REF
@@ -26,10 +34,6 @@ for the following settings:

 NOTE: this script treats discontinuous chunks as whole annotations.
 """
-from optparse import OptionParser
-import corpus2
-import sys, os
-from CSVWriter import CSVWriter

 class Stats:
    def __init__(self):
@@ -58,13 +62,17 @@ class Stats:
        self.head_hits += len(ch.intersection(ref))
    
    def getPRF(self, hits):
+        result = {}
        p = 0.0 if self.ch_chunks == 0 else 100.0 * hits / self.ch_chunks
        r = 0.0 if self.ref_chunks == 0 else 100.0 * hits / self.ref_chunks
        f = 0.0 if p + r == 0.0 else 2.0 * p * r / (p + r)
-        return [p, r, f]
+        result['P'] = p
+        result['R'] = r
+        result['F'] = f
+        return result
    
    def getStats(self):
-        return [self.getPRF(self.chunk_hits)]
+        return self.getPRF(self.chunk_hits)

 def get_annots(sent, chan_name):
    # wrap the sentence as an AnnotatedSentence
@@ -106,12 +114,18 @@ def go():
    ch_path, ref_path = args
    main(ch_path, ref_path, options.chunk_names, options.input_format, options.out_path, options.tagset, options.verbose, options.folds)
    
-def main(ch_path, ref_path, chan_name, input_format, out_path, tagset, verbose, folds):
+def main(ch_path, ref_path, chan_names, input_format, out_path, tagset, verbose, folds):

-    csvWriter = CSVWriter(",")
+    chan_names = chan_names.split(",")
    
-    csvWriter.addColumns(["Nr ","Chunk"])
-    csvWriter.addSubColumnsByName("Chunk", ["P", "R", "F"])
+    csvTable = CSVTable(";")
+    csvTable.addColumn('Nr')
+    
+    for chan_name in chan_names:        
+        csvTable.addColumn(chan_name)
+        csvTable.addSubColumn(chan_name, "P", type="float")
+        csvTable.addSubColumn(chan_name, "R", type="float")
+        csvTable.addSubColumn(chan_name, "F", type="float")
    
    tagset = corpus2.get_named_tagset(tagset)
    
@@ -123,6 +137,10 @@ def main(ch_path, ref_path, chan_name, input_format, out_path, tagset, verbose,
            ch_path_fold = ch_path
            ref_path_fold = ref_path

+        results = {}
+        
+        for chan_name in chan_names:
+            
            ch_rdr = corpus2.TokenReader.create_path_reader(
                input_format, tagset, ch_path_fold)
            ref_rdr = corpus2.TokenReader.create_path_reader(
@@ -146,11 +164,18 @@ def main(ch_path, ref_path, chan_name, input_format, out_path, tagset, verbose,
                    ref_annots = get_annots(ref_sent, chan_name)
                    stats.update(ch_annots, ref_annots)
        
-        results = stats.getStats()
-        results[:0] = [fold]
-        csvWriter.addRow(results)
-    csvWriter.count_avg()
-    print csvWriter
+            results[chan_name] = stats.getStats()
+            
+        csvTable.addRow(results)
+    if folds > 1:
+        csvTable.countAvg()
+    
+    if out_path != '':
+        out = codecs.open(out_path, "w", "utf-8")
+        out.close()
+    else:
+        print csvTable
+    
    
 if __name__ == '__main__':
    go()
\ No newline at end of file