Skip to content
Snippets Groups Projects
Commit 92e65047 authored by piotrmp's avatar piotrmp
Browse files

Finalised transition to models based on 2.13 UD and using UNK sowing.

parent c3f709f6
Branches
Tags
No related merge requests found
Pipeline #17015 passed
......@@ -26,14 +26,9 @@ Installation of LAMBO is easy.
1. First, you need to prepare an environment with Python, at least 3.10,
2. Then, download LAMBO from this repository:
2. Then, install LAMBO as follows:
```
git clone https://gitlab.clarin-pl.eu/syntactic-tools/lambo.git
```
3. Install LAMBO:
```
pip install ./lambo
pip install --index-url https://pypi.clarin-pl.eu/ lambo
```
You now have LAMBO installed in your environment.
......@@ -51,9 +46,9 @@ lambo = Lambo.get('English')
```
This will (if necessary) download the appropriate model from the online repository and load it. Note that you can use any language name (e.g. `Ancient_Greek`) or ISO 639-1 code (e.g. `fi`) from [`languages.txt`](src/lambo/resources/languages.txt).
Alternatively, you can select a specific model by defining LAMBO variant (`LAMBO` or `LAMBO_no_pretraining`) and training dataset from [`languages.txt`](src/lambo/resources/languages.txt):
Alternatively, you can select a specific model by specifying the LAMBO variant (`LAMBO_213` is the newest one) and the training dataset from [`languages.txt`](src/lambo/resources/languages.txt):
```
lambo = Lambo.get('LAMBO-UD_Polish-PDB')
lambo = Lambo.get('LAMBO_213-UD_Polish-PDB')
```
There are two optional arguments to the `get()` function:
......@@ -159,7 +154,7 @@ If you use LAMBO in your research, please cite it as software:
author = {{Przyby{\l}a, Piotr}},
title = {LAMBO: Layered Approach to Multi-level BOundary identification},
url = {https://gitlab.clarin-pl.eu/syntactic-tools/lambo},
version = {2.0.0},
version = {2.2},
year = {2022},
}
```
......
......@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project]
name = "lambo"
version = "2.1.1"
version = "2.2"
authors = [
{ name="Piotr Przybyła", email="piotr.przybyla@ipipan.waw.pl" },
]
......
......@@ -11,7 +11,7 @@ from lambo.segmenter.spacy import Spacy_segmenter
if __name__ == '__main__':
modelpath = Path.home() / 'data/lambo/models/full211-s/'
modelPpath = Path.home() / 'data/lambo/models/full213-withunk/'
modelPpath = Path.home() / 'data/lambo/models/full213-s-withunk/'
tmp_path = Path.home() / 'data/lambo/out/tmp.conllu'
treebanks = [line.split(' ')[0] for line in
......
......@@ -7,10 +7,10 @@ import pathlib
if __name__ == '__main__':
# Load the recommended model for English
lambo = Lambo.from_path(pathlib.Path.home() / 'data' / 'lambo'/ 'models' / 'withunk','UD_Polish-PDB', False)
lambo = Lambo.get('English')
# Provide text, including pauses (``(yy)``), emojis and turn markers (``<turn>``).
text = "Poza Japonią, począwszy od cesarza Shōwa, cesarzy często nazywano ich imionami, zarówno za życia, jak po śmierci."
text = "Simple sentences can't be enough... Some of us just ❤️ emojis. They should be tokens even when (yy) containing many characters, such as 👍🏿."
# Perform segmentation
document = lambo.segment(text)
......
......@@ -8,7 +8,7 @@ from lambo.data.sentence import Sentence
from lambo.data.token import Token
from lambo.data.turn import Turn
from lambo.learning.preprocessing_dict import prepare_test_withdict
from lambo.utils.download import download_model
from lambo.utils.download import download_model, default_type
from lambo.utils.special_tokens import detect_special_tokens
# Reading the turn separator file
......@@ -55,7 +55,6 @@ class Lambo():
:param provided_name: language, specified either as name (``Polish``) or ISO 639-1 code (``pl``)
:return: full model name
"""
default_type = 'LAMBO'
treebank = None
for line in resources.read_text('lambo.resources', 'languages.txt', encoding='utf-8', errors='strict').split(
'\n'):
......
......@@ -16,8 +16,11 @@ logger = logging.getLogger(__name__)
# The types of models available and their subdirectories in the model repository
TYPE_TO_PATH = {
"LAMBO_no_pretraining": "vanilla211-s",
"LAMBO": "full211-s"}
"LAMBO_211_no_pretraining": "vanilla211-s",
"LAMBO_211": "full211-s",
"LAMBO_213": "full213-s-withunk"}
default_type = 'LAMBO_213'
# The address of the remote repository
_URL = "http://home.ipipan.waw.pl/p.przybyla/lambo/{type}/{treebank}.{extension}"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment