From 92e65047012d53b4c8cedd5e869a8c66d9fdf6bf Mon Sep 17 00:00:00 2001
From: piotrmp <piotr.m.przybyla@gmail.com>
Date: Fri, 15 Mar 2024 15:16:11 +0100
Subject: [PATCH] Finalised transition to models based on 2.13 UD and using UNK
 sowing.

---
 README.md                            | 15 +++++----------
 pyproject.toml                       |  2 +-
 src/lambo/examples/run_evaluation.py |  2 +-
 src/lambo/examples/run_usage.py      |  4 ++--
 src/lambo/segmenter/lambo.py         |  3 +--
 src/lambo/utils/download.py          |  7 +++++--
 6 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 03eb57a..d788af8 100644
--- a/README.md
+++ b/README.md
@@ -26,14 +26,9 @@ Installation of LAMBO is easy.
 
 1. First, you need to prepare an environment with Python, at least 3.10,
 
-2. Then, download LAMBO from this repository:
+2. Then, install LAMBO as follows:
 ```
-git clone https://gitlab.clarin-pl.eu/syntactic-tools/lambo.git
-```
-
-3. Install LAMBO:
-```
-pip install ./lambo
+pip install --index-url https://pypi.clarin-pl.eu/ lambo
 ```
 
 You now have LAMBO installed in your environment.
@@ -51,9 +46,9 @@ lambo = Lambo.get('English')
 ```
 This will (if necessary) download the appropriate model from the online repository and load it. Note that you can use any language name (e.g. `Ancient_Greek`) or ISO 639-1 code (e.g. `fi`) from [`languages.txt`](src/lambo/resources/languages.txt).
 
-Alternatively, you can select a specific model by defining LAMBO variant (`LAMBO` or `LAMBO_no_pretraining`) and training dataset from [`languages.txt`](src/lambo/resources/languages.txt):
+Alternatively, you can select a specific model by specifying the LAMBO variant (`LAMBO_213` is the newest one; the older `LAMBO_211` and `LAMBO_211_no_pretraining` are also defined) and the training dataset from [`languages.txt`](src/lambo/resources/languages.txt):
 ```
-lambo = Lambo.get('LAMBO-UD_Polish-PDB')
+lambo = Lambo.get('LAMBO_213-UD_Polish-PDB')
 ```
 
 There are two optional arguments to the `get()` function:
@@ -159,7 +154,7 @@ If you use LAMBO in your research, please cite it as software:
   author = {{Przyby{\l}a, Piotr}},
   title = {LAMBO: Layered Approach to Multi-level BOundary identification},
   url = {https://gitlab.clarin-pl.eu/syntactic-tools/lambo},
-  version = {2.0.0},
+  version = {2.2},
   year = {2022},
 }
 ```
diff --git a/pyproject.toml b/pyproject.toml
index ffa16a6..0c10026 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "lambo"
-version = "2.1.1"
+version = "2.2"
 authors = [
   { name="Piotr Przybyła", email="piotr.przybyla@ipipan.waw.pl" },
 ]
diff --git a/src/lambo/examples/run_evaluation.py b/src/lambo/examples/run_evaluation.py
index 79b2632..8949cec 100644
--- a/src/lambo/examples/run_evaluation.py
+++ b/src/lambo/examples/run_evaluation.py
@@ -11,7 +11,7 @@ from lambo.segmenter.spacy import Spacy_segmenter
 
 if __name__ == '__main__':
     modelpath = Path.home() / 'data/lambo/models/full211-s/'
-    modelPpath = Path.home() / 'data/lambo/models/full213-withunk/'
+    modelPpath = Path.home() / 'data/lambo/models/full213-s-withunk/'
     tmp_path = Path.home() / 'data/lambo/out/tmp.conllu'
     
     treebanks = [line.split(' ')[0] for line in
diff --git a/src/lambo/examples/run_usage.py b/src/lambo/examples/run_usage.py
index 2f33f5a..7c01692 100644
--- a/src/lambo/examples/run_usage.py
+++ b/src/lambo/examples/run_usage.py
@@ -7,10 +7,10 @@ import pathlib
 if __name__ == '__main__':
     
     # Load the recommended model for Polish
-    lambo = Lambo.from_path(pathlib.Path.home() / 'data' / 'lambo'/ 'models' / 'withunk','UD_Polish-PDB', False)
+    lambo = Lambo.get('English')
     
     # Provide text, including pauses (``(yy)``), emojis and turn markers (``<turn>``).
-    text = "Poza Japonią, począwszy od cesarza Shōwa, cesarzy często nazywano ich imionami, zarówno za życia, jak po śmierci."
+    text = "Simple sentences can't be enough... Some of us just ❤️ emojis. They should be tokens even when (yy) containing many characters, such as 👍🏿."
     
     # Perform segmentation
     document = lambo.segment(text)
diff --git a/src/lambo/segmenter/lambo.py b/src/lambo/segmenter/lambo.py
index 7e8e634..e0f4f68 100644
--- a/src/lambo/segmenter/lambo.py
+++ b/src/lambo/segmenter/lambo.py
@@ -8,7 +8,7 @@ from lambo.data.sentence import Sentence
 from lambo.data.token import Token
 from lambo.data.turn import Turn
 from lambo.learning.preprocessing_dict import prepare_test_withdict
-from lambo.utils.download import download_model
+from lambo.utils.download import download_model, default_type
 from lambo.utils.special_tokens import detect_special_tokens
 
 # Reading the turn separator file
@@ -55,7 +55,6 @@ class Lambo():
         :param provided_name: language, specified either as name (``Polish``) or ISO 639-1 code (``pl``)
         :return: full model name
         """
-        default_type = 'LAMBO'
         treebank = None
         for line in resources.read_text('lambo.resources', 'languages.txt', encoding='utf-8', errors='strict').split(
                 '\n'):
diff --git a/src/lambo/utils/download.py b/src/lambo/utils/download.py
index a54e10e..84d522d 100644
--- a/src/lambo/utils/download.py
+++ b/src/lambo/utils/download.py
@@ -16,8 +16,11 @@ logger = logging.getLogger(__name__)
 
 # The types of models available and their subdirectories in the model repository
 TYPE_TO_PATH = {
-    "LAMBO_no_pretraining": "vanilla211-s",
-    "LAMBO": "full211-s"}
+    "LAMBO_211_no_pretraining": "vanilla211-s",
+    "LAMBO_211": "full211-s",
+    "LAMBO_213": "full213-s-withunk"}
+
+default_type = 'LAMBO_213'
 
 # The adress of the remote repository
 _URL = "http://home.ipipan.waw.pl/p.przybyla/lambo/{type}/{treebank}.{extension}"
-- 
GitLab