Syntactic Tools
lambo

Repository

git clone https://gitlab.clarin-pl.eu/syntactic-tools/lambo.git
pip install ./lambo
from lambo.segmenter.lambo import Lambo
lambo = Lambo.get('English')
lambo = Lambo.get('LAMBO-UD_Polish-PDB')
text = "Simple sentences can't be enough... Some of us just ❤️ emojis. They should be tokens even when (yy) containing many characters, such as 👍🏿."
document = lambo.segment(text)
for turn in document.turns:
    print('======= TURN =======')
    print('TEXT: ' + turn.text[:100] + '...')
    for sentence in turn.sentences:
        print('======= SENTENCE =======')
        print('TEXT: "' + sentence.text + '"')
        formatted = ''
        for token in sentence.tokens:
            if token.is_multi_word:
               formatted += '(' + token.text+ '=' + '-'.join(token.subwords) + ')'
            else:
               formatted += '(' + token.text + ')'
        print('TOKENS: ' + formatted)
======= TURN =======
TEXT: Simple sentences can't be enough... Some of us just ❤️ emojis. They should be tokens even when (yy) ...
======= SENTENCE =======
TEXT: "Simple sentences can't be enough... "
TOKENS: (Simple)(sentences)(can't=ca-n't)(be)(enough)(...)
======= SENTENCE =======
TEXT: "Some of us just ❤️ emojis. "
TOKENS: (Some)(of)(us)(just)(❤️)(emojis)(.)
======= SENTENCE =======
TEXT: "They should be tokens even when (yy) containing many characters, such as 👍🏿."
TOKENS: (They)(should)(be)(tokens)(even)(when)((yy))(containing)(many)(characters)(,)(such)(as)(👍🏿)(.)
from combo.predict import COMBO
from combo.utils import lambo
nlp_new = COMBO.from_pretrained("polish-herbert-base-ud29",tokenizer=lambo.LamboTokenizer("pl"))
text = ["To zdanie jest OK 👍🏿.", "To jest drugie zdanie."]
sentences = nlp_new(text)
text="To zdanie jest OK 👍🏿. To jest drugie zdanie."
sentences = nlp_new(text)
print("{:5} {:15} {:15} {:10} {:10} {:10}".format('ID', 'TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))
	for sentence in sentences:
		for token in sentence.tokens:
			print("{:5} {:15} {:15} {:10} {:10} {:10}".format(str(token.id), token.token, token.lemma, token.upostag, str(token.head), token.deprel))
		print("\n")
@software{LAMBO,
  author = {{Przyby{\l}a, Piotr}},
  title = {LAMBO: Layered Approach to Multi-level BOundary identification},
  url = {https://gitlab.clarin-pl.eu/syntactic-tools/lambo},
  version = {2.0.0},
  year = {2022},
}