diff --git a/plwn/readers/nodes.py b/plwn/readers/nodes.py index b1bf8d3cafafd96ae67ee3c0012489bd71d6fb7e..9f58999afc307145030394a1894feafd2ef1a914 100644 --- a/plwn/readers/nodes.py +++ b/plwn/readers/nodes.py @@ -31,19 +31,19 @@ __all__ = ( SynsetNode = namedtuple( "SynsetNode", - ["id", "definition", "related", "is_artificial"], + ["id", "legacy_id", "definition", "related", "is_artificial"], ) LexicalUnitNode = namedtuple( "LexicalUnitNode", - ["id", "lemma", "pos", "variant", "synset", "unit_index", "definition", - "usage_notes", "external_links", "examples", "examples_sources", - "domain", "related", "verb_aspect", "is_emotional", "emotion_markedness", - "emotion_names", "emotion_valuations", "emotion_example_1", - "emotion_example_2"] + ["id", "legacy_id", "lemma", "pos", "variant", "synset", "unit_index", + "definition", "usage_notes", "external_links", "examples", + "examples_sources", "domain", "related", "verb_aspect", "is_emotional", + "emotion_markedness", "emotion_names", "emotion_valuations", + "emotion_example_1", "emotion_example_2"] ) RelationTypeNode = namedtuple( "RelationTypeNode", - ["kind", "name", "parent", "aliases"], + ["kind", "legacy_id", "name", "parent", "aliases"], ) @@ -58,6 +58,7 @@ def make_synset_node(**props): """ syn = SynsetNode( id=props.pop('id'), + legacy_id=props.pop('legacy_id', None), definition=props.pop('definition', None), related=props.pop('related', ()), is_artificial=props.pop('is_artificial', False), @@ -80,6 +81,7 @@ def make_lexical_unit_node(**props): """ lex = LexicalUnitNode( id=props.pop('id'), + legacy_id=props.pop('legacy_id', None), lemma=props.pop('lemma'), pos=props.pop('pos'), variant=props.pop('variant'), @@ -120,6 +122,7 @@ def make_relation_type_node(**props): """ rel = RelationTypeNode( name=props.pop('name'), + legacy_id=props.pop('legacy_id'), kind=props.pop('kind'), parent=props.pop('parent', None), aliases=props.pop('aliases', frozenset()), diff --git a/plwn/readers/ubylmf.py b/plwn/readers/ubylmf.py index a3859ef5bc91737206c05d4c442103d09ad10a00..f91d2d3210c0d9a81565d8c2cd86432b2d434bdd 100644 --- a/plwn/readers/ubylmf.py +++ b/plwn/readers/ubylmf.py @@ -136,6 +136,7 @@ def _make_lexicalunit(xml_lexicalentry, xml_sense): ) return make_lexical_unit_node( id=lu_id, + legacy_id=None, lemma=lu_lemma, pos=PoS(lu_pos), synset=lu_synset, diff --git a/plwn/storages/sqlite.py b/plwn/storages/sqlite.py index 6f48144e9079de4eef93216b220df6b4bcd31854..904e5400ce632a08d27da49e89f9ebb99713ed27 100644 --- a/plwn/storages/sqlite.py +++ b/plwn/storages/sqlite.py @@ -108,7 +108,7 @@ CREATE UNIQUE INDEX value ON tbl_domain (value); -- Synset only gets one simple table CREATE TABLE tbl_synset ( - id BLOB NOT NULL , + id BLOB , legacy_id INTEGER NULL , definition TEXT NULL , isartificial INTEGER NOT NULL DEFAULT '0' , @@ -119,7 +119,7 @@ CREATE TABLE tbl_synset ( -- properties. They also need indexes for lookup. CREATE TABLE tbl_lexicalunit ( - id BLOB NOT NULL , + id BLOB , legacy_id INTEGER NULL , lemma VARCHAR(255) NOT NULL , pos INTEGER NOT NULL , @@ -204,14 +204,14 @@ CREATE UNIQUE INDEX name ON tbl_lexicalrelationchildpart (name); -- Parent can't be NULL - the no-parent case will be handled by a special empty -- string parent. This is so that UNIQUE works correctly. CREATE TABLE tbl_synsetrelationtype ( - id BLOB NOT NULL , + id BLOB , legacy_id INTEGER NULL , parentpart INTEGER NOT NULL , childpart INTEGER NOT NULL , PRIMARY KEY (id) ); CREATE TABLE tbl_lexicalrelationtype ( - id BLOB NOT NULL , + id BLOB , legacy_id INTEGER NULL , parentpart INTEGER NOT NULL , childpart INTEGER NOT NULL , @@ -1265,6 +1265,19 @@ class RelationInfo(bs.RelationInfoBase): self._par = _UNFETCHED self._name = _UNFETCHED self._aliases = _UNFETCHED + self._leg_id = _UNFETCHED + + @property + def legacy_id(self): + if self._leg_id is _UNFETCHED: + with closing(self._db.cursor()) as cur: + cur.execute( + u"SELECT legacy_id FROM {} WHERE id = ?" + .format(_RELTYPE_TABLES[self._kind]), + (self._id,), + ) + self._leg_id = cur.fetchone()[0] + return self._leg_id @property def kind(self): @@ -1387,10 +1400,11 @@ class _DBBuilder(object): def _insert_synset(self, syn_node): self._db.execute( u""" - INSERT INTO tbl_synset (id, definition, isartificial) - VALUES (?, ?, ?) + INSERT INTO tbl_synset (id, legacy_id, definition, isartificial) + VALUES (?, ?, ?, ?) """, - (syn_node.id, syn_node.definition, syn_node.is_artificial), + (syn_node.id, syn_node.legacy_id, syn_node.definition, + syn_node.is_artificial), ).close() # Related go into temp storage self._adhoc_synrels[syn_node.id] = syn_node.related @@ -1416,9 +1430,11 @@ class _DBBuilder(object): childname_id = self._ensure_rel_part_name(child_tbl, rel_node.name) # And now the relation itself cur.execute( - u"INSERT INTO {} (parentpart, childpart) VALUES (?, ?)" - .format(type_tbl), - (parname_id, childname_id), + u""" + INSERT INTO {} (legacy_id, parentpart, childpart) + VALUES (?, ?, ?) + """.format(type_tbl), + (rel_node.legacy_id, parname_id, childname_id), ) # Do aliases if present if rel_node.aliases: @@ -1439,14 +1455,14 @@ class _DBBuilder(object): cur.execute( u""" INSERT INTO tbl_lexicalunit ( - id, lemma, pos, variant, + id, legacy_id, lemma, pos, variant, synset, unitindex, definition, domain, verbaspect, isemotional, emotionmark, emotionexample1, emotionexample2 ) VALUES ( - :id, :lemma, :pos, :var, + :id, :legacy_id, :lemma, :pos, :var, :syn, :uidx, :def, :dom, :va, :emo_is, :emo_m, @@ -1455,6 +1471,7 @@ class _DBBuilder(object): """, { u'id': lu_node.id, + u'legacy_id': lu_node.legacy_id, u'lemma': lu_node.lemma, u'pos': self._posids[lu_node.pos.value], u'var': lu_node.variant, @@ -1607,7 +1624,10 @@ class _DBBuilder(object): return row[0] cur.execute( - u"INSERT INTO {} (parentpart, childpart) VALUES (?, ?)" + u""" + INSERT INTO {} (parentpart, childpart) + VALUES (?, ?) + """ .format(type_tbl), (empty_parent_id, child_id), ) diff --git a/tests/abstract_cases/test_unit_and_synset.py b/tests/abstract_cases/test_unit_and_synset.py index 626e50f2161d79f5026a2152a3becc2392617fd9..f3885dea2aae8871ef7ba7b48fe087ee43b98a05 100644 --- a/tests/abstract_cases/test_unit_and_synset.py +++ b/tests/abstract_cases/test_unit_and_synset.py @@ -29,9 +29,10 @@ class SynsetPropertiesTest(ut.TestCase): def setUp(self): self.__plwn = self._PLWNClass.from_reader(( - nd.make_synset_node(id=1, definition=u'foobar'), + nd.make_synset_node(id=1, legacy_id=1, definition=u'foobar'), nd.make_lexical_unit_node( id=11, + legacy_id=11, lemma=u'aaa', pos=en.PoS.n, variant=1, @@ -858,10 +859,10 @@ class ToDictTest(ut.TestCase): verb_aspect=en.VerbAspect.pred, emotion_markedness=en.EmotionMarkedness.strong_negative, emotion_names=(en.EmotionName.surprise,), - emotion_valuations=( - en.EmotionValuation.ugliness, + emotion_valuations=[ en.EmotionValuation.error, - ), + en.EmotionValuation.ugliness, + ], emotion_example_1=u'Bad thing.', ), nd.make_lexical_unit_node( @@ -886,7 +887,7 @@ class ToDictTest(ut.TestCase): self.__lex11_dict = { u'id': 11, u'lemma': u'aaa', - u'pos': u'verb', + u'pos': u'czasownik', u'variant': 1, u'synset': 1, u'definition': u'bar', diff --git a/tests/cases/test_ubylmf_reader.py b/tests/cases/test_ubylmf_reader.py index a8d7aba9944d7b84dab9e87c67291e7d6211ca1b..735c91c832890214572ba8aa2cd8ceadbdd2e7ef 100644 --- a/tests/cases/test_ubylmf_reader.py +++ b/tests/cases/test_ubylmf_reader.py @@ -28,7 +28,7 @@ test_xml = u"""<?xml version="1.0" encoding="UTF-8" ?> <LexicalResource dtdVersion="ubyDTD_1_0.dtd" name="plWordnet"> <Lexicon languageIdentifier="pl" id="1" name="Słowosieć 2.2"> -<LexicalEntry id="15" partOfSpeech="noun"> +<LexicalEntry id="15" partOfSpeech="rzeczownik"> <Lemma> <FormRepresentation writtenForm="'patafizyka"/> </Lemma> @@ -173,7 +173,7 @@ class UBYLMFReaderTest(unittest.TestCase): # Missing <Lemma> xml_lu = et.fromstring( u""" - <LexicalEntry id="15" partOfSpeech="noun"> + <LexicalEntry id="15" partOfSpeech="rzeczownik"> </LexicalEntry> """.encode(ENCODING) ) @@ -192,7 +192,7 @@ class UBYLMFReaderTest(unittest.TestCase): # Empty <Lemma> xml_lu = et.fromstring( u""" - <LexicalEntry id="15" partOfSpeech="noun"> + <LexicalEntry id="15" partOfSpeech="rzeczownik"> <Lemma> <FormRepresentation writtenForm=""/> </Lemma> @@ -227,7 +227,7 @@ class UBYLMFReaderTest(unittest.TestCase): # Incorrect unit index xml_lu = et.fromstring( u""" - <LexicalEntry id="15" partOfSpeech="noun"> + <LexicalEntry id="15" partOfSpeech="rzeczownik"> <Lemma> <FormRepresentation writtenForm="'patafizyka"/> </Lemma>