From 7ca1bc2a57e5036174b5dd31a117ef809a392b51 Mon Sep 17 00:00:00 2001 From: Maja Jablonska <majajjablonska@gmail.com> Date: Wed, 22 Nov 2023 02:47:39 +1100 Subject: [PATCH] Add a Prediction.ipynb notebook --- notebooks/Prediction.ipynb | 355 ++++++++++++++++++ .../{model_training.ipynb => Training.ipynb} | 0 2 files changed, 355 insertions(+) create mode 100644 notebooks/Prediction.ipynb rename notebooks/{model_training.ipynb => Training.ipynb} (100%) diff --git a/notebooks/Prediction.ipynb b/notebooks/Prediction.ipynb new file mode 100644 index 0000000..b0ee520 --- /dev/null +++ b/notebooks/Prediction.ipynb @@ -0,0 +1,355 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Prediction\n", + "\n", + "This notebook will demonstrate how to use a pretrained COMBO model." + ], + "metadata": { + "collapsed": false + }, + "id": "83e245f2c41847f3" + }, + { + "cell_type": "code", + "execution_count": 1, + "outputs": [], + "source": [ + "from combo.predict import COMBO" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-11-21T15:15:18.493035Z", + "start_time": "2023-11-21T15:15:14.638080Z" + } + }, + "id": "d61cad94b3cde64f" + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "initial_id", + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2023-11-21T15:37:53.106009Z", + "start_time": "2023-11-21T15:15:18.482336Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 319M/319M [22:15<00:00, 251kB/s] \n", + "Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.sso.sso_relationship.bias', 'cls.predictions.bias']\n", 
+ "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using model LAMBO-UD_Polish-PDB\n" + ] + }, + { + "data": { + "text/plain": "loading instances: 0it [00:00, ?it/s]", + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "382d00831a14426289956738bd3c4266" + } + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2023-11-21 15:37:51 UTC Loading archive] Error while loading Training Data Loader: File with path '/net/pr2/projects/plgrid/plgg_nlp/maja/ud-treebanks-v2.13/UD_Polish-PDB/pl_pdb-ud-train.conllu' does not exist!. Setting Data Loader to None\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using model LAMBO-UD_Polish-PDB\n" + ] + }, + { + "data": { + "text/plain": "loading instances: 0it [00:00, ?it/s]", + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "b973a8d39ad94eaab4167e5de5418e1c" + } + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2023-11-21 15:37:51 UTC Loading archive] Error while loading Validation Data Loader: File with path '/net/pr2/projects/plgrid/plgg_nlp/maja/ud-treebanks-v2.13/UD_Polish-PDB/pl_pdb-ud-dev.conllu' does not exist!. 
Setting Data Loader to None\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using model LAMBO-UD_Polish-PDB\n" + ] + } + ], + "source": [ + "predictor = COMBO.from_pretrained('polish-herbert-base-ud213')" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Predict a sentence:" + ], + "metadata": { + "collapsed": false + }, + "id": "ad45699f49f49450" + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [], + "source": [ + "predictions = predictor(\"Cześć! To jest przykładowe zdanie.\")" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-11-21T15:41:38.889677Z", + "start_time": "2023-11-21T15:41:38.443880Z" + } + }, + "id": "a397812ec6ffbd8a" + }, + { + "cell_type": "markdown", + "source": [ + "Predictions are a list of sentences with corresponding metadata." + ], + "metadata": { + "collapsed": false + }, + "id": "b7bae793bca448cb" + }, + { + "cell_type": "code", + "execution_count": 7, + "outputs": [ + { + "data": { + "text/plain": "[Sentence(tokens=[Cześć, !], metadata=matrix([[7.14260619e-03, 9.92857337e-01, 1.21313585e-18],\n [8.81941795e-01, 1.18058220e-01, 8.37145743e-16],\n [1.13332543e-08, 1.00000000e+00, 6.85589674e-09]])),\n Sentence(tokens=[To, jest, przykładowe, zdanie, .], metadata=matrix([[2.30086403e-06, 6.01210148e-09, 1.12105099e-08, 2.96070528e-08,\n 9.99997616e-01, 2.80899673e-14],\n [1.08029099e-05, 5.30108446e-09, 1.08553975e-08, 1.58541518e-08,\n 9.99989152e-01, 1.11981080e-15],\n [5.74975857e-05, 8.36746423e-08, 1.03298721e-08, 2.51524526e-08,\n 9.99942422e-01, 2.73261309e-11],\n [6.84069164e-05, 3.77383458e-09, 4.61269956e-09, 1.75928944e-05,\n 9.99913931e-01, 1.95838829e-13],\n [9.97742176e-01, 1.03157288e-06, 2.51532147e-05, 5.11952919e-07,\n 2.23120954e-03, 1.63324076e-11],\n [2.56056478e-06, 4.58555348e-07, 1.24389462e-05, 2.90007080e-07,\n 9.99984264e-01, 2.47912736e-12]]))]" + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + 
} + ], + "source": [ + "predictions" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-11-21T15:41:45.899260Z", + "start_time": "2023-11-21T15:41:45.855424Z" + } + }, + "id": "fba5df5411979d52" + }, + { + "cell_type": "markdown", + "source": [ + "Tokens contain additional predicted features." + ], + "metadata": { + "collapsed": false + }, + "id": "f582c2ef63f87503" + }, + { + "cell_type": "code", + "execution_count": 8, + "outputs": [ + { + "data": { + "text/plain": "[Cześć, !]" + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions[0].tokens" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-11-21T15:42:33.217710Z", + "start_time": "2023-11-21T15:42:33.165885Z" + } + }, + "id": "4772691b96f8c7fa" + }, + { + "cell_type": "markdown", + "source": [ + "Let's inspect one token:" + ], + "metadata": { + "collapsed": false + }, + "id": "153a7236e2ff56a8" + }, + { + "cell_type": "code", + "execution_count": 12, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TOKEN LEMMA UPOS HEAD DEPREL \n", + "Cześć cześć VERB 0 root \n" + ] + } + ], + "source": [ + "token = predictions[0].tokens[0]\n", + "row_format = \"{:15} {:15} {:10} {:10} {:10}\"\n", + "print(row_format.format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))\n", + "print(row_format.format(token.text, token.lemma, token.upostag, token.head, token.deprel))" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-11-21T15:43:24.164087Z", + "start_time": "2023-11-21T15:43:24.091530Z" + } + }, + "id": "173da06d7fef299b" + }, + { + "cell_type": "markdown", + "source": [ + "Let's print all the predicted tokens. For convenience, let's flatten the predictions first." 
+ ], + "metadata": { + "collapsed": false + }, + "id": "344445d6a2b8893e" + }, + { + "cell_type": "code", + "execution_count": 18, + "outputs": [ + { + "data": { + "text/plain": "[Cześć, !, To, jest, przykładowe, zdanie, .]" + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Flatten the per-sentence token lists into one list, keeping sentence order.\n", + "flat_tokens = [\n", + "    token for sentence in predictions for token in sentence.tokens\n", + "]\n", + "flat_tokens" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-11-21T15:46:40.333747Z", + "start_time": "2023-11-21T15:46:40.282880Z" + } + }, + "id": "718614eb414b6b3a" + }, + { + "cell_type": "code", + "execution_count": 19, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TOKEN LEMMA UPOS HEAD DEPREL \n", + "Cześć cześć VERB 0 root \n", + "! ! PUNCT 1 punct \n", + "To to AUX 4 cop \n", + "jest być AUX 4 aux \n", + "przykładowe przykładowy ADJ 4 amod \n", + "zdanie zdanie NOUN 0 root \n", + ". . 
PUNCT 4 punct \n" + ] + } + ], + "source": [ + "print(\"{:15} {:15} {:10} {:10} {:10}\".format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))\n", + "for t in flat_tokens:\n", + " print(\"{:15} {:15} {:10} {:10} {:10}\".format(t.text, t.lemma, t.upostag, t.head, t.deprel))" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-11-21T15:47:03.697133Z", + "start_time": "2023-11-21T15:47:03.670136Z" + } + }, + "id": "7722d2623fa2faae" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + }, + "id": "c939410d13aac684" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/model_training.ipynb b/notebooks/Training.ipynb similarity index 100% rename from notebooks/model_training.ipynb rename to notebooks/Training.ipynb -- GitLab