diff --git a/README.md b/README.md index 03eb57a1ad58eb0887e0238bf8f6bbb97fd9ae52..d788af86b17b75b18f568fdd149dc46271a58a72 100644 --- a/README.md +++ b/README.md @@ -26,14 +26,9 @@ Installation of LAMBO is easy. 1. First, you need to prepare an environment with Python, at least 3.10, -2. Then, download LAMBO from this repository: +2. Then, install LAMBO as follows: ``` -git clone https://gitlab.clarin-pl.eu/syntactic-tools/lambo.git -``` - -3. Install LAMBO: -``` -pip install ./lambo +pip install --index-url https://pypi.clarin-pl.eu/ lambo ``` You now have LAMBO installed in your environment. @@ -51,9 +46,9 @@ lambo = Lambo.get('English') ``` This will (if necessary) download the appropriate model from the online repository and load it. Note that you can use any language name (e.g. `Ancient_Greek`) or ISO 639-1 code (e.g. `fi`) from [`languages.txt`](src/lambo/resources/languages.txt). -Alternatively, you can select a specific model by defining LAMBO variant (`LAMBO` or `LAMBO_no_pretraining`) and training dataset from [`languages.txt`](src/lambo/resources/languages.txt): +Alternatively, you can select a specific model by specifying the LAMBO variant (`LAMBO_213` is the newest one) and the training dataset from [`languages.txt`](src/lambo/resources/languages.txt): ``` -lambo = Lambo.get('LAMBO-UD_Polish-PDB') +lambo = Lambo.get('LAMBO_213-UD_Polish-PDB') ``` There are two optional arguments to the `get()` function: @@ -159,7 +154,7 @@ If you use LAMBO in your research, please cite it as software: author = {{Przyby{\l}a, Piotr}}, title = {LAMBO: Layered Approach to Multi-level BOundary identification}, url = {https://gitlab.clarin-pl.eu/syntactic-tools/lambo}, - version = {2.0.0}, + version = {2.2}, year = {2022}, } ``` diff --git a/pyproject.toml b/pyproject.toml index ffa16a6de8cdb73953e3c15a69d24c29a3ed83e6..0c10026d1f799ffb8bb5f4de3f090d54f99c5e5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "lambo" 
-version = "2.1.1" +version = "2.2" authors = [ { name="Piotr Przybyła", email="piotr.przybyla@ipipan.waw.pl" }, ] diff --git a/src/lambo/examples/run_evaluation.py b/src/lambo/examples/run_evaluation.py index 79b2632f27857260569b0b952ca083198f7de5c3..8949cecf3b248be91e2cc205e785b0eda37c42a8 100644 --- a/src/lambo/examples/run_evaluation.py +++ b/src/lambo/examples/run_evaluation.py @@ -11,7 +11,7 @@ from lambo.segmenter.spacy import Spacy_segmenter if __name__ == '__main__': modelpath = Path.home() / 'data/lambo/models/full211-s/' - modelPpath = Path.home() / 'data/lambo/models/full213-withunk/' + modelPpath = Path.home() / 'data/lambo/models/full213-s-withunk/' tmp_path = Path.home() / 'data/lambo/out/tmp.conllu' treebanks = [line.split(' ')[0] for line in diff --git a/src/lambo/examples/run_usage.py b/src/lambo/examples/run_usage.py index 2f33f5a5f607b8d190128bce0ddd142594ce942b..7c01692cf5793f832248d04293923d7ee2d8bb2a 100644 --- a/src/lambo/examples/run_usage.py +++ b/src/lambo/examples/run_usage.py @@ -7,10 +7,10 @@ import pathlib if __name__ == '__main__': # Load the recommended model for Polish - lambo = Lambo.from_path(pathlib.Path.home() / 'data' / 'lambo'/ 'models' / 'withunk','UD_Polish-PDB', False) + lambo = Lambo.get('English') # Provide text, including pauses (``(yy)``), emojis and turn markers (``<turn>``). - text = "Poza Japonią, począwszy od cesarza Shōwa, cesarzy często nazywano ich imionami, zarówno za życia, jak po śmierci." + text = "Simple sentences can't be enough... Some of us just ❤️ emojis. They should be tokens even when (yy) containing many characters, such as 👍🏿." 
# Perform segmentation document = lambo.segment(text) diff --git a/src/lambo/segmenter/lambo.py b/src/lambo/segmenter/lambo.py index 7e8e6346772b3ce61af9ea903cd430af99b54586..e0f4f68298542a77b3e21ff3dbf62890f73117a2 100644 --- a/src/lambo/segmenter/lambo.py +++ b/src/lambo/segmenter/lambo.py @@ -8,7 +8,7 @@ from lambo.data.sentence import Sentence from lambo.data.token import Token from lambo.data.turn import Turn from lambo.learning.preprocessing_dict import prepare_test_withdict -from lambo.utils.download import download_model +from lambo.utils.download import download_model, default_type from lambo.utils.special_tokens import detect_special_tokens # Reading the turn separator file @@ -55,7 +55,6 @@ class Lambo(): :param provided_name: language, specified either as name (``Polish``) or ISO 639-1 code (``pl``) :return: full model name """ - default_type = 'LAMBO' treebank = None for line in resources.read_text('lambo.resources', 'languages.txt', encoding='utf-8', errors='strict').split( '\n'): diff --git a/src/lambo/utils/download.py b/src/lambo/utils/download.py index a54e10e2a519aa3af34f0361a0a79052dcf1509a..84d522db0c8f81fd44c39e99796b16902fb3ddb3 100644 --- a/src/lambo/utils/download.py +++ b/src/lambo/utils/download.py @@ -16,8 +16,11 @@ logger = logging.getLogger(__name__) # The types of models available and their subdirectories in the model repository TYPE_TO_PATH = { - "LAMBO_no_pretraining": "vanilla211-s", - "LAMBO": "full211-s"} + "LAMBO_211_no_pretraining": "vanilla211-s", + "LAMBO_211": "full211-s", + "LAMBO_213": "full213-s-withunk"} + +default_type = 'LAMBO_213' # The adress of the remote repository _URL = "http://home.ipipan.waw.pl/p.przybyla/lambo/{type}/{treebank}.{extension}"