Commit ce87e7d3 authored by Tomasz Walkowiak

category with left and right context

parent 52db134e
Pipeline #2786 passed with stages
in 1 minute and 47 seconds
......@@ -78,6 +78,8 @@ class Category:
"""."""
res = ""
space = ""
if sentence is None:
return ""
for token in list(sentence):
if token.tag == "ns":
space = ""
......@@ -86,14 +88,17 @@ class Category:
space = " "
return res
def add(self, stat, cat, what, sentence):
def add(self, stat, cat, what, sentence, sentence_left, sentence_right):
"""."""
stat = stat[KEY]
if cat not in stat:
stat[cat] = {}
if what not in stat[cat]:
stat[cat][what] = []
stat[cat][what].append(self.to_sentence(sentence))
stat[cat][what].append({'s': self.to_sentence(sentence),
'sl': self.to_sentence(sentence_left),
'sr': self.to_sentence(sentence_right),
})
def process(self, inputFile, taskOptions, outputFile, XLSXpath):
"""."""
......@@ -103,8 +108,12 @@ class Category:
model = self.get_model(path)
tree = ET.parse(inputFile)
stat = {"tokens": 0, KEY: {}}
sentences = list(tree.iter("sentence"))
for si in range(len(sentences)):
sentence = sentences[si]
sl = sentences[si - 1] if si > 1 else None
sr = sentences[si + 1] if si < len(sentences) - 1 else None
for sentence in tree.iter("sentence"):
for token in sentence.iter("tok"):
base = token.find("./lex/base").text
stat["tokens"] += 1
......@@ -120,7 +129,8 @@ class Category:
id = str(prop.text)
if (id in model.synsets and
model.synsets[id][1] == mwe_base):
self.add(stat, model.synsets[id][0], id, sentence)
self.add(stat, model.synsets[id][0], id, sentence,
sl, sr)
self.inc(stat, model.synsets[id][0])
if self.verbose:
print("Synset %s, category %s" % (id,
......@@ -136,7 +146,7 @@ class Category:
if (el in model.variants and
model.variants[el][1] == mwe_base):
self.add(stat, model.variants[el][0], el,
sentence)
sentence, sl, sr)
self.inc(stat, model.variants[el][0])
if self.verbose:
print("Variant %s, category %s" % (el,
......@@ -148,7 +158,8 @@ class Category:
if not found:
if base in model.lemmas:
self.add(stat, model.lemmas[base], base, sentence)
self.add(stat, model.lemmas[base], base, sentence, sl,
sr)
self.inc(stat, model.lemmas[base])
if self.verbose:
print("Base %s, category %s" % (base,
......@@ -165,6 +176,8 @@ def main():
cat = Category(verbose=True)
cat.process("./test/tet3.ccl", {"path": "/test/as.xlsx", "full": True},
"./test/test3_out.json", ".")
cat.process("./test/test2.ccl", {"path": "/test/as.xlsx", "full": True},
"./test/test2_out.json", ".")
if __name__ == "__main__":
......
This diff is collapsed.
{"tokens":43,"__SENTENCES__":{"Insight2":{"dowiedzie\u0107_si\u0119.2":["W szczeg\u00f3lno\u015bci kiedy dowiedzia\u0142am si\u0119 o implikacjach tego faktu."],"droga":["Jestem \u015bwiadoma, \u017ce nie jest to naj\u0142atwiejsza droga osi\u0105gni\u0119cia celu, ale domy\u015blam si\u0119, \u017ce musz\u0119 po prostu przestawi\u0107 si\u0119 do nowej rzeczywisto\u015bci."]}},"Insight2":2}
\ No newline at end of file
{"tokens":43,"__SENTENCES__":{"Insight2":{"dowiedzie\u0107_si\u0119.2":[{"s":"W szczeg\u00f3lno\u015bci kiedy dowiedzia\u0142am si\u0119 o implikacjach tego faktu.","sl":"","sr":"Jestem \u015bwiadoma, \u017ce nie jest to naj\u0142atwiejsza droga osi\u0105gni\u0119cia celu, ale domy\u015blam si\u0119, \u017ce musz\u0119 po prostu przestawi\u0107 si\u0119 do nowej rzeczywisto\u015bci."}],"droga":[{"s":"Jestem \u015bwiadoma, \u017ce nie jest to naj\u0142atwiejsza droga osi\u0105gni\u0119cia celu, ale domy\u015blam si\u0119, \u017ce musz\u0119 po prostu przestawi\u0107 si\u0119 do nowej rzeczywisto\u015bci.","sl":"W szczeg\u00f3lno\u015bci kiedy dowiedzia\u0142am si\u0119 o implikacjach tego faktu.","sr":""}]}},"Insight2":2}
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment