diff --git a/plwn/enums.py b/plwn/enums.py index 246fc3c8290958889f6d55fa5bf0d1098045bc2f..6dfcc70530dfb08d64189a99370872dacae0002e 100644 --- a/plwn/enums.py +++ b/plwn/enums.py @@ -104,10 +104,10 @@ class PoS(Enum): __order__ = 'verb noun adverb adjective ' \ 'verb_en noun_en adverb_en adjective_en' - verb = u'verb' - noun = u'noun' - adverb = u'adverb' - adjective = u'adjective' + verb = u'czasownik' + noun = u'rzeczownik' + adverb = u'przysłówek' + adjective = u'przymiotnik' # English (PWN) PoSes verb_en = u'verb_en' @@ -346,8 +346,8 @@ class Domain(Enum): sys = u'systematyka, klasyfikacja' - adj = u'PWN: all adjective clusters' - adv = u'PWN: all adverbs' + adj = u'all adjective clusters' + adv = u'all adverbs' mat = u'przymiotniki materiałowe' diff --git a/plwn/storages/sqlite.py b/plwn/storages/sqlite.py index 4d918a8444c918b70dd14436b73d4acca0b615af..54f5f329dc582cb504c646ab1cff36075dfc8aae 100644 --- a/plwn/storages/sqlite.py +++ b/plwn/storages/sqlite.py @@ -63,195 +63,195 @@ _DB_SCHEMA_SCRIPT = u""" PRAGMA foreign_keys = ON; -- Metadata table. 
Used for version number, currently -CREATE TABLE "tbl_plwn_meta" ( - "name" VARCHAR(255) NOT NULL , - "value" BLOB NULL +CREATE TABLE tbl_plwn_meta ( + name VARCHAR(255) NOT NULL , + value BLOB NULL ); -- Tables for constant values -CREATE TABLE "tbl_pos" ( - "id" INTEGER NOT NULL , - "value" VARCHAR(255) NOT NULL , - PRIMARY KEY ("id") +CREATE TABLE tbl_pos ( + id INTEGER NOT NULL , + value VARCHAR(255) NOT NULL , + PRIMARY KEY (id) ); -CREATE TABLE "tbl_verbaspect" ( - "id" INTEGER NOT NULL , - "value" VARCHAR(255) NOT NULL , - PRIMARY KEY ("id") +CREATE TABLE tbl_verbaspect ( + id INTEGER NOT NULL , + value VARCHAR(255) NOT NULL , + PRIMARY KEY (id) ); -CREATE TABLE "tbl_emotionmark" ( - "id" INTEGER NOT NULL , - "value" VARCHAR(255) NOT NULL , - PRIMARY KEY ("id") +CREATE TABLE tbl_emotionmark ( + id INTEGER NOT NULL , + value VARCHAR(255) NOT NULL , + PRIMARY KEY (id) ); -CREATE TABLE "tbl_emotionname" ( - "id" INTEGER NOT NULL , - "value" VARCHAR(255) NOT NULL , - PRIMARY KEY ("id") +CREATE TABLE tbl_emotionname ( + id INTEGER NOT NULL , + value VARCHAR(255) NOT NULL , + PRIMARY KEY (id) ); -CREATE TABLE "tbl_emotionvaluation" ( - "id" INTEGER NOT NULL , - "value" VARCHAR(255) NOT NULL , - PRIMARY KEY ("id") +CREATE TABLE tbl_emotionvaluation ( + id INTEGER NOT NULL , + value VARCHAR(255) NOT NULL , + PRIMARY KEY (id) ); -CREATE TABLE "tbl_domain" ( - "id" INTEGER NOT NULL , - "value" VARCHAR(255) NOT NULL , - PRIMARY KEY ("id") +CREATE TABLE tbl_domain ( + id INTEGER NOT NULL , + value VARCHAR(255) NOT NULL , + PRIMARY KEY (id) ); -CREATE UNIQUE INDEX "value" ON "tbl_domain" ("value"); +CREATE UNIQUE INDEX value ON tbl_domain (value); -- Synset only gets one simple table -CREATE TABLE "tbl_synset" ( - "id" BLOB NOT NULL , - "legacy_id" INTEGER NULL , - "definition" TEXT NULL , - "isartificial" INTEGER NOT NULL DEFAULT '0' , - PRIMARY KEY ("id") +CREATE TABLE tbl_synset ( + id BLOB NOT NULL , + legacy_id INTEGER NULL , + definition TEXT NULL , + isartificial 
INTEGER NOT NULL DEFAULT '0' , + PRIMARY KEY (id) ); -- Lexical units have several tables, since they have several list-like -- properties. They also need indexes for lookup. -CREATE TABLE "tbl_lexicalunit" ( - "id" BLOB NOT NULL , - "legacy_id" INTEGER NULL , - "lemma" VARCHAR(255) NOT NULL , - "pos" INTEGER NOT NULL , - "variant" INTEGER NOT NULL , - "synset" BLOB NOT NULL , - "unitindex" INTEGER NOT NULL , - "definition" TEXT NULL , - "domain" INTEGER NOT NULL , - "verbaspect" INTEGER NULL , - "isemotional" INTEGER NULL , - "emotionmark" INTEGER NULL , - "emotionexample1" TEXT NULL , - "emotionexample2" TEXT NULL , - PRIMARY KEY ("id") +CREATE TABLE tbl_lexicalunit ( + id BLOB NOT NULL , + legacy_id INTEGER NULL , + lemma VARCHAR(255) NOT NULL , + pos INTEGER NOT NULL , + variant INTEGER NOT NULL , + synset BLOB NOT NULL , + unitindex INTEGER NOT NULL , + definition TEXT NULL , + domain INTEGER NOT NULL , + verbaspect INTEGER NULL , + isemotional INTEGER NULL , + emotionmark INTEGER NULL , + emotionexample1 TEXT NULL , + emotionexample2 TEXT NULL , + PRIMARY KEY (id) ); -CREATE UNIQUE INDEX "lemma" ON "tbl_lexicalunit" ("lemma", "pos", "variant"); -CREATE INDEX "lex_i_lem_var" ON "tbl_lexicalunit" ("lemma", "variant"); -CREATE INDEX "lex_i_pos" ON "tbl_lexicalunit" ("pos"); -CREATE UNIQUE INDEX "synset" ON "tbl_lexicalunit" ("synset", "unitindex"); +CREATE UNIQUE INDEX lemma ON tbl_lexicalunit (lemma, pos, variant); +CREATE INDEX lex_i_lem_var ON tbl_lexicalunit (lemma, variant); +CREATE INDEX lex_i_pos ON tbl_lexicalunit (pos); +CREATE UNIQUE INDEX synset ON tbl_lexicalunit (synset, unitindex); -- Tables dependant on lexicalunit -CREATE TABLE "tbl_senseexample" ( - "unitid" BLOB NOT NULL , - "example" TEXT NOT NULL , - "source" TEXT NOT NULL +CREATE TABLE tbl_senseexample ( + unitid BLOB NOT NULL , + example TEXT NOT NULL , + source TEXT NOT NULL ); -CREATE INDEX "sen_i" ON "tbl_senseexample" ("unitid"); +CREATE INDEX sen_i ON tbl_senseexample (unitid); -CREATE 
TABLE "tbl_externallink" ( - "unitid" BLOB NOT NULL , - "link" TEXT NOT NULL +CREATE TABLE tbl_externallink ( + unitid BLOB NOT NULL , + link TEXT NOT NULL ); -CREATE INDEX "link_i" ON "tbl_externallink" ("unitid"); +CREATE INDEX link_i ON tbl_externallink (unitid); -CREATE TABLE "tbl_usagenote" ( - "unitid" BLOB NOT NULL , - "note" TEXT NOT NULL +CREATE TABLE tbl_usagenote ( + unitid BLOB NOT NULL , + note TEXT NOT NULL ); -CREATE INDEX "note_i" ON "tbl_usagenote" ("unitid"); +CREATE INDEX note_i ON tbl_usagenote (unitid); -CREATE TABLE "tbl_unitemotionname" ( - "unitid" BLOB NOT NULL , - "nameid" INTEGER NOT NULL , - PRIMARY KEY ("unitid", "nameid") +CREATE TABLE tbl_unitemotionname ( + unitid BLOB NOT NULL , + nameid INTEGER NOT NULL , + PRIMARY KEY (unitid, nameid) ); -CREATE TABLE "tbl_unitemotionvaluation" ( - "unitid" BLOB NOT NULL , - "valuationid" INTEGER NOT NULL , - PRIMARY KEY ("unitid", "valuationid") +CREATE TABLE tbl_unitemotionvaluation ( + unitid BLOB NOT NULL , + valuationid INTEGER NOT NULL , + PRIMARY KEY (unitid, valuationid) ); -- Relation tables -- -- The for below are used to gather combinations of parent / child relation -- names. 
-CREATE TABLE "tbl_synsetrelationparentpart" ( - "id" INTEGER NOT NULL , - "name" VARCHAR(255) NOT NULL , - PRIMARY KEY ("id") +CREATE TABLE tbl_synsetrelationparentpart ( + id INTEGER NOT NULL , + name VARCHAR(255) NOT NULL , + PRIMARY KEY (id) ); -CREATE TABLE "tbl_synsetrelationchildpart" ( - "id" INTEGER NOT NULL , - "name" VARCHAR(255) NOT NULL , - PRIMARY KEY ("id") +CREATE TABLE tbl_synsetrelationchildpart ( + id INTEGER NOT NULL , + name VARCHAR(255) NOT NULL , + PRIMARY KEY (id) ); -CREATE TABLE "tbl_lexicalrelationparentpart" ( - "id" INTEGER NOT NULL , - "name" VARCHAR(255) NOT NULL , - PRIMARY KEY ("id") +CREATE TABLE tbl_lexicalrelationparentpart ( + id INTEGER NOT NULL , + name VARCHAR(255) NOT NULL , + PRIMARY KEY (id) ); -CREATE TABLE "tbl_lexicalrelationchildpart" ( - "id" INTEGER NOT NULL , - "name" VARCHAR(255) NOT NULL , - PRIMARY KEY ("id") +CREATE TABLE tbl_lexicalrelationchildpart ( + id INTEGER NOT NULL , + name VARCHAR(255) NOT NULL , + PRIMARY KEY (id) ); -CREATE UNIQUE INDEX "name" ON "tbl_lexicalrelationchildpart" ("name"); -CREATE UNIQUE INDEX "parentpart" ON "tbl_lexicalrelationtype" ( - "parentpart", - "childpart" +CREATE UNIQUE INDEX name ON tbl_lexicalrelationchildpart (name); +CREATE UNIQUE INDEX parentpart ON tbl_lexicalrelationtype ( + parentpart, + childpart ); -- Next, gather these parts into relation types themselves. -- Parent can't be NULL - the no-parent case will be handled by a special empty -- string parent. This is so that UNIQUE works correctly. 
-CREATE TABLE "tbl_synsetrelationtype" ( - "id" BLOB NOT NULL , - "legacy_id" INTEGER NULL , - "parentpart" INTEGER NOT NULL , - "childpart" INTEGER NOT NULL , - PRIMARY KEY ("id") +CREATE TABLE tbl_synsetrelationtype ( + id BLOB NOT NULL , + legacy_id INTEGER NULL , + parentpart INTEGER NOT NULL , + childpart INTEGER NOT NULL , + PRIMARY KEY (id) ); -CREATE TABLE "tbl_lexicalrelationtype" ( - "id" BLOB NOT NULL , - "legacy_id" INTEGER NULL , - "parentpart" INTEGER NOT NULL , - "childpart" INTEGER NOT NULL , - PRIMARY KEY ("id") +CREATE TABLE tbl_lexicalrelationtype ( + id BLOB NOT NULL , + legacy_id INTEGER NULL , + parentpart INTEGER NOT NULL , + childpart INTEGER NOT NULL , + PRIMARY KEY (id) ); -- The below tables are simply maps of relation aliases to their main IDs. -- Reverse indexes are needed, too. -CREATE TABLE "tbl_synsetrelationalias" ( - "name" VARCHAR(255) NOT NULL , - "relationid" BLOB NOT NULL , - PRIMARY KEY ("name") +CREATE TABLE tbl_synsetrelationalias ( + name VARCHAR(255) NOT NULL , + relationid BLOB NOT NULL , + PRIMARY KEY (name) ); -CREATE INDEX "synsetrelationalias_irev" ON "tbl_synsetrelationalias" ( - "relationid" +CREATE INDEX synsetrelationalias_irev ON tbl_synsetrelationalias ( + relationid ); -CREATE TABLE "tbl_lexicalrelationalias" ( - "name" VARCHAR(255) NOT NULL , - "relationid" BLOB NOT NULL , - PRIMARY KEY ("name") +CREATE TABLE tbl_lexicalrelationalias ( + name VARCHAR(255) NOT NULL , + relationid BLOB NOT NULL , + PRIMARY KEY (name) ); -CREATE INDEX "lexicalrelationalias_irev" ON "tbl_lexicalrelationalias" ( - "relationid" +CREATE INDEX lexicalrelationalias_irev ON tbl_lexicalrelationalias ( + relationid ); -- Next are finally the relation instances -CREATE TABLE "tbl_synsetrelation" ( - "source" BLOB NOT NULL , - "relationtype" BLOB NOT NULL , - "target" BLOB NOT NULL , - PRIMARY KEY ("source", "relationtype", "target") +CREATE TABLE tbl_synsetrelation ( + source BLOB NOT NULL , + relationtype BLOB NOT NULL , + target BLOB NOT 
NULL , + PRIMARY KEY (source, relationtype, target) ); -CREATE TABLE "tbl_lexicalrelation" ( - "source" BLOB NOT NULL , - "relationtype" BLOB NOT NULL , - "target" BLOB NOT NULL , - PRIMARY KEY ("source", "relationtype", "target") +CREATE TABLE tbl_lexicalrelation ( + source BLOB NOT NULL , + relationtype BLOB NOT NULL , + target BLOB NOT NULL , + PRIMARY KEY (source, relationtype, target) ); -- Insert the special empty values for the parent part tables @@ -284,7 +284,7 @@ _RELINST_TABLES = { class PLWordNet(bs.PLWordNetBase): _STORAGE_NAME = 'sqlite3' - _SCHEMA_VERSION = 4 + _SCHEMA_VERSION = '4' @classmethod def from_reader(cls, reader, dump_to=None): @@ -366,8 +366,8 @@ class PLWordNet(bs.PLWordNetBase): with closing(self._db.cursor()) as cur: cur.execute( u""" - SELECT lemma, pos.value, variant, synset - FROM tbl_lexicalunit + SELECT lemma, tbl_pos.value, variant, synset + FROM tbl_lexicalunit JOIN tbl_pos ON tbl_lexicalunit.pos = tbl_pos.id WHERE tbl_lexicalunit.id = ? """, @@ -546,7 +546,8 @@ class PLWordNet(bs.PLWordNetBase): with closing(self._db.cursor()) as cur: cur.execute( u""" - SELECT tbl_lexicalunit.id, lemma, pos.value, variant, synset + SELECT tbl_lexicalunit.id, lemma, tbl_pos.value, + variant, synset FROM tbl_lexicalunit JOIN tbl_pos ON tbl_lexicalunit.pos = tbl_pos.id WHERE COALESCE(lemma = :lem, :defval) @@ -599,7 +600,10 @@ class PLWordNet(bs.PLWordNetBase): ((en.value,) for en in en.EmotionName), ).close() self._db.executemany( - u"INSERT OR IGNORE INTO tbl_emotionvaluation (value) VALUES (?)", + u""" + INSERT OR IGNORE INTO tbl_emotionvaluation (value) + VALUES (?) 
+ """, ((ev.value,) for ev in en.EmotionValuation), ).close() self._db.executemany( @@ -678,6 +682,7 @@ class LexicalUnit(bs.LexicalUnitBase): self._var = variant self._synid = synid # Rest is unitialized + self._leg_id = _UNFETCHED self._syn = _UNFETCHED self._def = _UNFETCHED self._usn = _UNFETCHED @@ -717,6 +722,17 @@ class LexicalUnit(bs.LexicalUnitBase): def is_english(self): return self._pos.is_english + @property + def legacy_id(self): + if self._leg_id is _UNFETCHED: + with closing(self._db.cursor()) as cur: + cur.execute( + u"SELECT legacy_id FROM tbl_lexicalunit WHERE id = ?", + (self._id,), + ) + self._leg_id = cur.fetchone()[0] + return self._leg_id + @property def synset(self): if self._syn is _UNFETCHED or self._syn() is None: @@ -934,7 +950,8 @@ class LexicalUnit(bs.LexicalUnitBase): with closing(self._db.cursor()) as cur: cur.execute( u""" - SELECT tbl_lexicalunit.id, lemma, tbl_pos.value, variant, synset + SELECT tbl_lexicalunit.id, + lemma, tbl_pos.value, variant, synset FROM tbl_lexicalrelation JOIN tbl_lexicalunit ON tbl_lexicalunit.id = target JOIN tbl_pos ON tbl_lexicalunit.pos = tbl_pos.id @@ -997,6 +1014,7 @@ class Synset(bs.SynsetBase): self._units = _UNFETCHED self._def = _UNFETCHED + self._leg_id = _UNFETCHED self._pos = _UNFETCHED self._is_polish = _UNFETCHED @@ -1033,7 +1051,8 @@ class Synset(bs.SynsetBase): cur.execute( u""" SELECT tbl_lexicalunit.id, lemma, tbl_pos.value, variant - FROM tbl_lexicalunit JOIN tbl_pos ON tbl_lexicalunit.pos = tbl_pos.id + FROM tbl_lexicalunit + JOIN tbl_pos ON tbl_lexicalunit.pos = tbl_pos.id WHERE synset = ? 
ORDER BY unitindex """, @@ -1054,6 +1073,17 @@ class Synset(bs.SynsetBase): assert self._units return self._units + @property + def legacy_id(self): + if self._leg_id is _UNFETCHED: + with closing(self._db.cursor()) as cur: + cur.execute( + u"SELECT legacy_id FROM tbl_synset WHERE id = ?", + (self._id,), + ) + self._leg_id = cur.fetchone()[0] + return self._leg_id + @property def definition(self): if self._def is _UNFETCHED: @@ -1313,18 +1343,18 @@ class _DBBuilder(object): # Synset to lexical units relations also need to be deferred. self._synid2lexids = coll.defaultdict(list) # Cache IDs of constant values - with closing(db.execute(u"SELECT value, id FROM tbl_pos")) as cur: + with closing(db.execute(u"SELECT value, id FROM tbl_pos")) as cur: self._posids = dict(cur) - with closing(db.execute(u"SELECT value, id FROM tbl_verbaspect")) as cur: + with closing(db.execute(u"SELECT value, id FROM tbl_verbaspect")) as cur: self._vaids = dict(cur) - with closing(db.execute(u"SELECT value, id FROM tbl_emotionmark")) as cur: + with closing(db.execute(u"SELECT value, id FROM tbl_emotionmark")) as cur: self._emids = dict(cur) - with closing(db.execute(u"SELECT value, id FROM tbl_emotionname")) as cur: + with closing(db.execute(u"SELECT value, id FROM tbl_emotionname")) as cur: self._enids = dict(cur) - with closing(db.execute(u"SELECT value, id FROM tbl_emotionvaluation")) \ + with closing(db.execute(u"SELECT value, id FROM tbl_emotionvaluation")) \ as cur: self._evids = dict(cur) - with closing(db.execute(u"SELECT value, id FROM tbl_domain")) as cur: + with closing(db.execute(u"SELECT value, id FROM tbl_domain")) as cur: self._dmids = dict(cur) def __call__(self, reader): @@ -1351,7 +1381,7 @@ class _DBBuilder(object): def _insert_synset(self, syn_node): self._db.execute( u""" - INSERT INTO tbl_synset (id, definition, isartificial) + INSERT INTO tbl_synset (id, definition, isartificial) VALUES (?, ?, ?)
""", (syn_node.id, syn_node.definition, syn_node.is_artificial), @@ -1402,7 +1432,7 @@ class _DBBuilder(object): try: cur.execute( u""" - INSERT INTO tbl_lexicalunit ( + INSERT INTO tbl_lexicalunit ( id, lemma, pos, variant, synset, unitindex, definition, domain, verbaspect, @@ -1453,7 +1483,7 @@ class _DBBuilder(object): cur.executemany( u""" - INSERT INTO tbl_senseexample (unitid, example, source) + INSERT INTO tbl_senseexample (unitid, example, source) VALUES (?, ?, ?) """, ( @@ -1464,14 +1494,14 @@ class _DBBuilder(object): ) cur.executemany( u""" - INSERT INTO tbl_usagenote (unitid, note) + INSERT INTO tbl_usagenote (unitid, note) VALUES (?, ?) """, ((lu_node.id, note) for note in lu_node.usage_notes), ) cur.executemany( u""" - INSERT INTO tbl_externallink (unitid, link) + INSERT INTO tbl_externallink (unitid, link) VALUES (?, ?) """, ((lu_node.id, link) @@ -1479,7 +1509,7 @@ class _DBBuilder(object): ) cur.executemany( u""" - INSERT INTO tbl_unitemotionname (unitid, nameid) + INSERT INTO tbl_unitemotionname (unitid, nameid) VALUES (?, ?) """, ( @@ -1489,7 +1519,7 @@ class _DBBuilder(object): ) cur.executemany( u""" - INSERT INTO tbl_unitemotionvaluation (unitid, valuationid) + INSERT INTO tbl_unitemotionvaluation (unitid, valuationid) VALUES (?, ?) """, ( @@ -1626,10 +1656,10 @@ class _DBBuilder(object): with closing(self._db.cursor()) as cur: cur.execute( u""" - SELECT tbl_synset.id - FROM tbl_synset - LEFT JOIN tbl_lexicalunit ON tbl_synset.id = tbl_lexicalunit.synset - WHERE tbl_lexicalunit.synset IS NULL + SELECT tbl_synset.id + FROM tbl_synset + LEFT JOIN tbl_lexicalunit ON tbl_synset.id = tbl_lexicalunit.synset + WHERE tbl_lexicalunit.synset IS NULL """, ) empties = tuple(row[0] for row in cur) @@ -1642,7 +1672,7 @@ class _DBBuilder(object): _LOG.warning('Synset %d is empty', synid) self._db.execute( - u"DELETE FROM tbl_synset WHERE id IN ({})".format( + u"DELETE FROM tbl_synset WHERE id IN ({})".format( u','.join(u'?' * len(empties)) ), empties,