diff --git a/plwn-api_plwn_dump_new_07-12-2022.sqlite b/plwn-api_plwn_dump_new_07-12-2022.sqlite deleted file mode 100644 index 19fe03e3c05a58941aea3e5207be305382858d67..0000000000000000000000000000000000000000 Binary files a/plwn-api_plwn_dump_new_07-12-2022.sqlite and /dev/null differ diff --git a/plwn/enums.py b/plwn/enums.py index 6dfcc70530dfb08d64189a99370872dacae0002e..246fc3c8290958889f6d55fa5bf0d1098045bc2f 100644 --- a/plwn/enums.py +++ b/plwn/enums.py @@ -104,10 +104,10 @@ class PoS(Enum): __order__ = 'verb noun adverb adjective ' \ 'verb_en noun_en adverb_en adjective_en' - verb = u'czasownik' - noun = u'rzeczownik' - adverb = u'przysłówek' - adjective = u'przymiotnik' + verb = u'verb' + noun = u'noun' + adverb = u'adverb' + adjective = u'adjective' # English (PWN) PoSes verb_en = u'verb_en' @@ -346,8 +346,8 @@ class Domain(Enum): sys = u'systematyka, klasyfikacja' - adj = u'all adjective clusters' - adv = u'all adverbs' + adj = u'PWN: all adjective clusters' + adv = u'PWN: all adverbs' mat = u'przymiotniki materiałowe' diff --git a/plwn/storages/sqlite.py b/plwn/storages/sqlite.py index 54f5f329dc582cb504c646ab1cff36075dfc8aae..4d918a8444c918b70dd14436b73d4acca0b615af 100644 --- a/plwn/storages/sqlite.py +++ b/plwn/storages/sqlite.py @@ -63,195 +63,195 @@ _DB_SCHEMA_SCRIPT = u""" PRAGMA foreign_keys = ON; -- Metadata table. Used for version number, currently -CREATE TABLE tbl_plwn_meta ( - name VARCHAR(255) NOT NULL , - value BLOB NULL +CREATE TABLE "tbl_plwn_meta" ( + "name" VARCHAR(255) NOT NULL , + "value" BLOB NULL ); -- Tables for constant values -CREATE TABLE tbl_pos ( - id INTEGER NOT NULL , - value VARCHAR(255) NOT NULL , - PRIMARY KEY (id) +CREATE TABLE "tbl_pos" ( + "id" INTEGER NOT NULL , + "value" VARCHAR(255) NOT NULL , + PRIMARY KEY ("id") ); -CREATE TABLE tbl_verbaspect ( - id INTEGER NOT NULL , - value VARCHAR(255) NOT NULL , - PRIMARY KEY (id) +CREATE TABLE "tbl_verbaspect" ( + "id" INTEGER NOT NULL , + "value" VARCHAR(255) NOT NULL , + PRIMARY KEY ("id") ); -CREATE TABLE tbl_emotionmark ( - id INTEGER NOT NULL , - value VARCHAR(255) NOT NULL , - PRIMARY KEY (id) +CREATE TABLE "tbl_emotionmark" ( + "id" INTEGER NOT NULL , + "value" VARCHAR(255) NOT NULL , + PRIMARY KEY ("id") ); -CREATE TABLE tbl_emotionname ( - id INTEGER NOT NULL , - value VARCHAR(255) NOT NULL , - PRIMARY KEY (id) +CREATE TABLE "tbl_emotionname" ( + "id" INTEGER NOT NULL , + "value" VARCHAR(255) NOT NULL , + PRIMARY KEY ("id") ); -CREATE TABLE tbl_emotionvaluation ( - id INTEGER NOT NULL , - value VARCHAR(255) NOT NULL , - PRIMARY KEY (id) +CREATE TABLE "tbl_emotionvaluation" ( + "id" INTEGER NOT NULL , + "value" VARCHAR(255) NOT NULL , + PRIMARY KEY ("id") ); -CREATE TABLE tbl_domain ( - id INTEGER NOT NULL , - value VARCHAR(255) NOT NULL , - PRIMARY KEY (id) +CREATE TABLE "tbl_domain" ( + "id" INTEGER NOT NULL , + "value" VARCHAR(255) NOT NULL , + PRIMARY KEY ("id") ); -CREATE UNIQUE INDEX value ON tbl_domain (value); +CREATE UNIQUE INDEX "value" ON "tbl_domain" ("value"); -- Synset only gets one simple table -CREATE TABLE tbl_synset ( - id BLOB NOT NULL , - legacy_id INTEGER NULL , - definition TEXT NULL , - isartificial INTEGER NOT NULL DEFAULT '0' , - PRIMARY KEY (id) +CREATE TABLE "tbl_synset" ( + "id" BLOB NOT NULL , + "legacy_id" INTEGER NULL , + "definition" TEXT NULL , + "isartificial" INTEGER NOT NULL DEFAULT '0' , + PRIMARY KEY ("id") ); -- Lexical units have several tables, since they have several list-like -- properties. They also need indexes for lookup. -CREATE TABLE tbl_lexicalunit ( - id BLOB NOT NULL , - legacy_id INTEGER NULL , - lemma VARCHAR(255) NOT NULL , - pos INTEGER NOT NULL , - variant INTEGER NOT NULL , - synset BLOB NOT NULL , - unitindex INTEGER NOT NULL , - definition TEXT NULL , - domain INTEGER NOT NULL , - verbaspect INTEGER NULL , - isemotional INTEGER NULL , - emotionmark INTEGER NULL , - emotionexample1 TEXT NULL , - emotionexample2 TEXT NULL , - PRIMARY KEY (id) +CREATE TABLE "tbl_lexicalunit" ( + "id" BLOB NOT NULL , + "legacy_id" INTEGER NULL , + "lemma" VARCHAR(255) NOT NULL , + "pos" INTEGER NOT NULL , + "variant" INTEGER NOT NULL , + "synset" BLOB NOT NULL , + "unitindex" INTEGER NOT NULL , + "definition" TEXT NULL , + "domain" INTEGER NOT NULL , + "verbaspect" INTEGER NULL , + "isemotional" INTEGER NULL , + "emotionmark" INTEGER NULL , + "emotionexample1" TEXT NULL , + "emotionexample2" TEXT NULL , + PRIMARY KEY ("id") ); -CREATE UNIQUE INDEX lemma ON tbl_lexicalunit (lemma, pos, variant); -CREATE INDEX lex_i_lem_var ON tbl_lexicalunit (lemma, variant); -CREATE INDEX lex_i_pos ON tbl_lexicalunit (pos); -CREATE UNIQUE INDEX synset ON tbl_lexicalunit (synset, unitindex); +CREATE UNIQUE INDEX "lemma" ON "tbl_lexicalunit" ("lemma", "pos", "variant"); +CREATE INDEX "lex_i_lem_var" ON "tbl_lexicalunit" ("lemma", "variant"); +CREATE INDEX "lex_i_pos" ON "tbl_lexicalunit" ("pos"); +CREATE UNIQUE INDEX "synset" ON "tbl_lexicalunit" ("synset", "unitindex"); -- Tables dependant on lexicalunit -CREATE TABLE tbl_senseexample ( - unitid BLOB NOT NULL , - example TEXT NOT NULL , - source TEXT NOT NULL +CREATE TABLE "tbl_senseexample" ( + "unitid" BLOB NOT NULL , + "example" TEXT NOT NULL , + "source" TEXT NOT NULL ); -CREATE INDEX sen_i ON tbl_senseexample (unitid); +CREATE INDEX "sen_i" ON "tbl_senseexample" ("unitid"); -CREATE TABLE tbl_externallink ( - unitid BLOB NOT NULL , - link TEXT NOT NULL +CREATE TABLE "tbl_externallink" ( + "unitid" BLOB NOT NULL , + "link" TEXT NOT NULL ); -CREATE INDEX link_i ON tbl_externallink (unitid); +CREATE INDEX "link_i" ON "tbl_externallink" ("unitid"); -CREATE TABLE tbl_usagenote ( - unitid BLOB NOT NULL , - note TEXT NOT NULL +CREATE TABLE "tbl_usagenote" ( + "unitid" BLOB NOT NULL , + "note" TEXT NOT NULL ); -CREATE INDEX note_i ON tbl_usagenote (unitid); +CREATE INDEX "note_i" ON "tbl_usagenote" ("unitid"); -CREATE TABLE tbl_unitemotionname ( - unitid BLOB NOT NULL , - nameid INTEGER NOT NULL , - PRIMARY KEY (unitid, nameid) +CREATE TABLE "tbl_unitemotionname" ( + "unitid" BLOB NOT NULL , + "nameid" INTEGER NOT NULL , + PRIMARY KEY ("unitid", "nameid") ); -CREATE TABLE tbl_unitemotionvaluation ( - unitid BLOB NOT NULL , - valuationid INTEGER NOT NULL , - PRIMARY KEY (unitid, valuationid) +CREATE TABLE "tbl_unitemotionvaluation" ( + "unitid" BLOB NOT NULL , + "valuationid" INTEGER NOT NULL , + PRIMARY KEY ("unitid", "valuationid") ); -- Relation tables -- -- The for below are used to gather combinations of parent / child relation -- names. -CREATE TABLE tbl_synsetrelationparentpart ( - id INTEGER NOT NULL , - name VARCHAR(255) NOT NULL , - PRIMARY KEY (id) +CREATE TABLE "tbl_synsetrelationparentpart" ( + "id" INTEGER NOT NULL , + "name" VARCHAR(255) NOT NULL , + PRIMARY KEY ("id") ); -CREATE TABLE tbl_synsetrelationchildpart ( - id INTEGER NOT NULL , - name VARCHAR(255) NOT NULL , - PRIMARY KEY (id) +CREATE TABLE "tbl_synsetrelationchildpart" ( + "id" INTEGER NOT NULL , + "name" VARCHAR(255) NOT NULL , + PRIMARY KEY ("id") ); -CREATE TABLE tbl_lexicalrelationparentpart ( - id INTEGER NOT NULL , - name VARCHAR(255) NOT NULL , - PRIMARY KEY (id) +CREATE TABLE "tbl_lexicalrelationparentpart" ( + "id" INTEGER NOT NULL , + "name" VARCHAR(255) NOT NULL , + PRIMARY KEY ("id") ); -CREATE TABLE tbl_lexicalrelationchildpart ( - id INTEGER NOT NULL , - name VARCHAR(255) NOT NULL , - PRIMARY KEY (id) +CREATE TABLE "tbl_lexicalrelationchildpart" ( + "id" INTEGER NOT NULL , + "name" VARCHAR(255) NOT NULL , + PRIMARY KEY ("id") ); -CREATE UNIQUE INDEX name ON tbl_lexicalrelationchildpart (name); -CREATE UNIQUE INDEX parentpart ON tbl_lexicalrelationtype ( - parentpart, - childpart +CREATE UNIQUE INDEX "name" ON "tbl_lexicalrelationchildpart" ("name"); +CREATE UNIQUE INDEX "parentpart" ON "tbl_lexicalrelationtype" ( + "parentpart", + "childpart" ); -- Next, gather these parts into relation types themselves. -- Parent can't be NULL - the no-parent case will be handled by a special empty -- string parent. This is so that UNIQUE works correctly. -CREATE TABLE tbl_synsetrelationtype ( - id BLOB NOT NULL , - legacy_id INTEGER NULL , - parentpart INTEGER NOT NULL , - childpart INTEGER NOT NULL , - PRIMARY KEY (id) +CREATE TABLE "tbl_synsetrelationtype" ( + "id" BLOB NOT NULL , + "legacy_id" INTEGER NULL , + "parentpart" INTEGER NOT NULL , + "childpart" INTEGER NOT NULL , + PRIMARY KEY ("id") ); -CREATE TABLE tbl_lexicalrelationtype ( - id BLOB NOT NULL , - legacy_id INTEGER NULL , - parentpart INTEGER NOT NULL , - childpart INTEGER NOT NULL , - PRIMARY KEY (id) +CREATE TABLE "tbl_lexicalrelationtype" ( + "id" BLOB NOT NULL , + "legacy_id" INTEGER NULL , + "parentpart" INTEGER NOT NULL , + "childpart" INTEGER NOT NULL , + PRIMARY KEY ("id") ); -- The below tables are simply maps of relation aliases to their main IDs. -- Reverse indexes are needed, too. -CREATE TABLE tbl_synsetrelationalias ( - name VARCHAR(255) NOT NULL , - relationid BLOB NOT NULL , - PRIMARY KEY (name) +CREATE TABLE "tbl_synsetrelationalias" ( + "name" VARCHAR(255) NOT NULL , + "relationid" BLOB NOT NULL , + PRIMARY KEY ("name") ); -CREATE INDEX synsetrelationalias_irev ON tbl_synsetrelationalias ( - relationid +CREATE INDEX "synsetrelationalias_irev" ON "tbl_synsetrelationalias" ( + "relationid" ); -CREATE TABLE tbl_lexicalrelationalias ( - name VARCHAR(255) NOT NULL , - relationid BLOB NOT NULL , - PRIMARY KEY (name) +CREATE TABLE "tbl_lexicalrelationalias" ( + "name" VARCHAR(255) NOT NULL , + "relationid" BLOB NOT NULL , + PRIMARY KEY ("name") ); -CREATE INDEX lexicalrelationalias_irev ON tbl_lexicalrelationalias ( - relationid +CREATE INDEX "lexicalrelationalias_irev" ON "tbl_lexicalrelationalias" ( + "relationid" ); -- Next are finally the relation instances -CREATE TABLE tbl_synsetrelation ( - source BLOB NOT NULL , - relationtype BLOB NOT NULL , - target BLOB NOT NULL , - PRIMARY KEY (source, relationtype, target) +CREATE TABLE "tbl_synsetrelation" ( + "source" BLOB NOT NULL , + "relationtype" BLOB NOT NULL , + "target" BLOB NOT NULL , + PRIMARY KEY ("source", "relationtype", "target") ); -CREATE TABLE tbl_lexicalrelation ( - source BLOB NOT NULL , - relationtype BLOB NOT NULL , - target BLOB NOT NULL , - PRIMARY KEY (source, relationtype, target) +CREATE TABLE "tbl_lexicalrelation" ( + "source" BLOB NOT NULL , + "relationtype" BLOB NOT NULL , + "target" BLOB NOT NULL , + PRIMARY KEY ("source", "relationtype", "target") ); -- Insert the special empty values for the parent part tables @@ -284,7 +284,7 @@ _RELINST_TABLES = { class PLWordNet(bs.PLWordNetBase): _STORAGE_NAME = 'sqlite3' - _SCHEMA_VERSION = '4' + _SCHEMA_VERSION = 4 @classmethod def from_reader(cls, reader, dump_to=None): @@ -366,8 +366,8 @@ class PLWordNet(bs.PLWordNetBase): with closing(self._db.cursor()) as cur: cur.execute( u""" - SELECT lemma, tbl_pos.value, variant, synset - FROM tbl_lexicalunit + SELECT lemma, pos.value, variant, synset + FROM tbl_lexicalunit JOIN tbl_pos ON tbl_lexicalunit.pos = tbl_pos.id WHERE tbl_lexicalunit.id = ? """, @@ -546,8 +546,7 @@ class PLWordNet(bs.PLWordNetBase): with closing(self._db.cursor()) as cur: cur.execute( u""" - SELECT tbl_lexicalunit.id, lemma, tbl_pos.value, - variant, synset + SELECT tbl_lexicalunit.id, lemma, pos.value, variant, synset FROM tbl_lexicalunit JOIN tbl_pos ON tbl_lexicalunit.pos = tbl_pos.id WHERE COALESCE(lemma = :lem, :defval) @@ -600,10 +599,7 @@ class PLWordNet(bs.PLWordNetBase): ((en.value,) for en in en.EmotionName), ).close() self._db.executemany( - u""" - INSERT OR IGNORE INTO tbl_emotionvaluation (value) - VALUES (?) - """, + u"INSERT OR IGNORE INTO tbl_emotionvaluation (value) VALUES (?)", ((ev.value,) for ev in en.EmotionValuation), ).close() self._db.executemany( @@ -682,7 +678,6 @@ class LexicalUnit(bs.LexicalUnitBase): self._var = variant self._synid = synid # Rest is unitialized - self._leg_id = _UNFETCHED self._syn = _UNFETCHED self._def = _UNFETCHED self._usn = _UNFETCHED @@ -722,17 +717,6 @@ class LexicalUnit(bs.LexicalUnitBase): def is_english(self): return self._pos.is_english - @property - def legacy_id(self): - if self._leg_id is _UNFETCHED: - with closing(self._db.cursor()) as cur: - cur.execute( - u"SELECT legacy_id FROM tbl_lexicalunit WHERE id = ?", - (self._id,), - ) - self._leg_id = cur.fetchone()[0] - return self._leg_id - @property def synset(self): if self._syn is _UNFETCHED or self._syn() is None: @@ -950,8 +934,7 @@ class LexicalUnit(bs.LexicalUnitBase): with closing(self._db.cursor()) as cur: cur.execute( u""" - SELECT tbl_lexicalunit.id, - lemma, tbl_pos.value, variant, synset + SELECT tbl_lexicalunit.id, lemma, tbl_pos.value, variant, synset FROM tbl_lexicalrelation JOIN tbl_lexicalunit ON tbl_lexicalunit.id = target JOIN tbl_pos ON tbl_lexicalunit.pos = tbl_pos.id @@ -1014,7 +997,6 @@ class Synset(bs.SynsetBase): self._units = _UNFETCHED self._def = _UNFETCHED - self._leg_id = _UNFETCHED self._pos = _UNFETCHED self._is_polish = _UNFETCHED @@ -1051,8 +1033,7 @@ class Synset(bs.SynsetBase): cur.execute( u""" SELECT tbl_lexicalunit.id, lemma, tbl_pos.value, variant - FROM tbl_lexicalunit - JOIN tbl_pos ON tbl_lexicalunit.pos = tbl_pos.id + FROM tbl_lexicalunit JOIN tbl_pos ON tbl_lexicalunit.pos = tbl_pos.id WHERE synset = ? ORDER BY unitindex """, @@ -1073,17 +1054,6 @@ class Synset(bs.SynsetBase): assert self._units return self._units - @property - def legacy_id(self): - if self._leg_id is _UNFETCHED: - with closing(self._db.cursor()) as cur: - cur.execute( - u"SELECT legacy_id FROM tbl_synset WHERE id = ?", - (self._id,), - ) - self._leg_id = cur.fetchone()[0] - return self._leg_id - @property def definition(self): if self._def is _UNFETCHED: @@ -1343,18 +1313,18 @@ class _DBBuilder(object): # Synset to lexical units relations also need to be deferred. self._synid2lexids = coll.defaultdict(list) # Cache IDs of constant values - with closing(db.execute(u"SELECT value, id FROM pos")) as cur: + with closing(db.execute(u"SELECT value, id FROM tbl_pos")) as cur: self._posids = dict(cur) - with closing(db.execute(u"SELECT value, id FROM verbaspect")) as cur: + with closing(db.execute(u"SELECT value, id FROM tbl_verbaspect")) as cur: self._vaids = dict(cur) - with closing(db.execute(u"SELECT value, id FROM emotionmark")) as cur: + with closing(db.execute(u"SELECT value, id FROM tbl_emotionmark")) as cur: self._emids = dict(cur) - with closing(db.execute(u"SELECT value, id FROM emotionname")) as cur: + with closing(db.execute(u"SELECT value, id FROM tbl_emotionname")) as cur: self._enids = dict(cur) - with closing(db.execute(u"SELECT value, id FROM emotionvaluation")) \ + with closing(db.execute(u"SELECT value, id FROM tbl_emotionvaluation")) \ as cur: self._evids = dict(cur) - with closing(db.execute(u"SELECT value, id FROM domain")) as cur: + with closing(db.execute(u"SELECT value, id FROM tbl_domain")) as cur: self._dmids = dict(cur) def __call__(self, reader): @@ -1381,7 +1351,7 @@ class _DBBuilder(object): def _insert_synset(self, syn_node): self._db.execute( u""" - INSERT INTO synset (id, definition, isartificial) + INSERT INTO tbl_synset (id, definition, isartificial) VALUES (?, ?, ?) """, (syn_node.id, syn_node.definition, syn_node.is_artificial), @@ -1432,7 +1402,7 @@ class _DBBuilder(object): try: cur.execute( u""" - INSERT INTO lexicalunit ( + INSERT INTO tbl_lexicalunit ( id, lemma, pos, variant, synset, unitindex, definition, domain, verbaspect, @@ -1483,7 +1453,7 @@ class _DBBuilder(object): cur.executemany( u""" - INSERT INTO senseexample (unitid, example, source) + INSERT INTO tbl_senseexample (unitid, example, source) VALUES (?, ?, ?) """, ( @@ -1494,14 +1464,14 @@ class _DBBuilder(object): ) cur.executemany( u""" - INSERT INTO usagenote (unitid, note) + INSERT INTO tbl_usagenote (unitid, note) VALUES (?, ?) """, ((lu_node.id, note) for note in lu_node.usage_notes), ) cur.executemany( u""" - INSERT INTO externallink (unitid, link) + INSERT INTO tbl_externallink (unitid, link) VALUES (?, ?) """, ((lu_node.id, link) @@ -1509,7 +1479,7 @@ class _DBBuilder(object): ) cur.executemany( u""" - INSERT INTO unitemotionname (unitid, nameid) + INSERT INTO tbl_unitemotionname (unitid, nameid) VALUES (?, ?) """, ( @@ -1519,7 +1489,7 @@ class _DBBuilder(object): ) cur.executemany( u""" - INSERT INTO unitemotionvaluation (unitid, valuationid) + INSERT INTO tbl_unitemotionvaluation (unitid, valuationid) VALUES (?, ?) """, ( @@ -1656,10 +1626,10 @@ class _DBBuilder(object): with closing(self._db.cursor()) as cur: cur.execute( u""" - SELECT synset.id - FROM synset - LEFT JOIN lexicalunit ON synset.id = lexicalunit.synset - WHERE lexicalunit.synset IS NULL + SELECT tbl_synset.id + FROM tbl_synset + LEFT JOIN tbl_lexicalunit ON tbl_synset.id = tbl_lexicalunit.synset + WHERE tbl_lexicalunit.synset IS NULL """, ) empties = tuple(row[0] for row in cur) @@ -1672,7 +1642,7 @@ class _DBBuilder(object): _LOG.warning('Synset %d is empty', synid) self._db.execute( - u"DELETE FROM synset WHERE id IN ({})".format( + u"DELETE FROM tbl_synset WHERE id IN ({})".format( u','.join(u'?' * len(empties)) ), empties,