Add a Prediction.ipynb notebook

7ca1bc2a · Maja Jablonska · d574777b · 7ca1bc2a · 7ca1bc2a
Commit 7ca1bc2a authored Nov 21, 2023 by Maja Jablonska
--- a/notebooks/Prediction.ipynb
+++ b/notebooks/Prediction.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "source": [
+    "# Prediction\n",
+    "\n",
+    "This notebook will demonstrate how to use a pretrained COMBO model."
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "83e245f2c41847f3"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "outputs": [],
+   "source": [
+    "from combo.predict import COMBO"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-11-21T15:15:18.493035Z",
+     "start_time": "2023-11-21T15:15:14.638080Z"
+    }
+   },
+   "id": "d61cad94b3cde64f"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "ExecuteTime": {
+     "end_time": "2023-11-21T15:37:53.106009Z",
+     "start_time": "2023-11-21T15:15:18.482336Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 319M/319M [22:15<00:00, 251kB/s]    \n",
+      "Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.sso.sso_relationship.bias', 'cls.predictions.bias']\n",
+      "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using model LAMBO-UD_Polish-PDB\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": "loading instances: 0it [00:00, ?it/s]",
+      "application/vnd.jupyter.widget-view+json": {
+       "version_major": 2,
+       "version_minor": 0,
+       "model_id": "382d00831a14426289956738bd3c4266"
+      }
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2023-11-21 15:37:51 UTC Loading archive] Error while loading Training Data Loader: File with path '/net/pr2/projects/plgrid/plgg_nlp/maja/ud-treebanks-v2.13/UD_Polish-PDB/pl_pdb-ud-train.conllu' does not exist!. Setting Data Loader to None\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using model LAMBO-UD_Polish-PDB\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": "loading instances: 0it [00:00, ?it/s]",
+      "application/vnd.jupyter.widget-view+json": {
+       "version_major": 2,
+       "version_minor": 0,
+       "model_id": "b973a8d39ad94eaab4167e5de5418e1c"
+      }
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2023-11-21 15:37:51 UTC Loading archive] Error while loading Validation Data Loader: File with path '/net/pr2/projects/plgrid/plgg_nlp/maja/ud-treebanks-v2.13/UD_Polish-PDB/pl_pdb-ud-dev.conllu' does not exist!. Setting Data Loader to None\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using model LAMBO-UD_Polish-PDB\n"
+     ]
+    }
+   ],
+   "source": [
+    "predictor = COMBO.from_pretrained('polish-herbert-base-ud213')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Predict a sentence:"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "ad45699f49f49450"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "outputs": [],
+   "source": [
+    "predictions = predictor(\"Cześć! To jest przykładowe zdanie.\")"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-11-21T15:41:38.889677Z",
+     "start_time": "2023-11-21T15:41:38.443880Z"
+    }
+   },
+   "id": "a397812ec6ffbd8a"
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "`predictions` is a list of `Sentence` objects, one per sentence in the input text; each holds the sentence's tokens together with its metadata."
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "b7bae793bca448cb"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "[Sentence(tokens=[Cześć, !], metadata=matrix([[7.14260619e-03, 9.92857337e-01, 1.21313585e-18],\n         [8.81941795e-01, 1.18058220e-01, 8.37145743e-16],\n         [1.13332543e-08, 1.00000000e+00, 6.85589674e-09]])),\n Sentence(tokens=[To, jest, przykładowe, zdanie, .], metadata=matrix([[2.30086403e-06, 6.01210148e-09, 1.12105099e-08, 2.96070528e-08,\n          9.99997616e-01, 2.80899673e-14],\n         [1.08029099e-05, 5.30108446e-09, 1.08553975e-08, 1.58541518e-08,\n          9.99989152e-01, 1.11981080e-15],\n         [5.74975857e-05, 8.36746423e-08, 1.03298721e-08, 2.51524526e-08,\n          9.99942422e-01, 2.73261309e-11],\n         [6.84069164e-05, 3.77383458e-09, 4.61269956e-09, 1.75928944e-05,\n          9.99913931e-01, 1.95838829e-13],\n         [9.97742176e-01, 1.03157288e-06, 2.51532147e-05, 5.11952919e-07,\n          2.23120954e-03, 1.63324076e-11],\n         [2.56056478e-06, 4.58555348e-07, 1.24389462e-05, 2.90007080e-07,\n          9.99984264e-01, 2.47912736e-12]]))]"
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "predictions"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-11-21T15:41:45.899260Z",
+     "start_time": "2023-11-21T15:41:45.855424Z"
+    }
+   },
+   "id": "fba5df5411979d52"
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Each token carries the features predicted for it, such as its lemma, part-of-speech tag, head and dependency relation."
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "f582c2ef63f87503"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "[Cześć, !]"
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "predictions[0].tokens"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-11-21T15:42:33.217710Z",
+     "start_time": "2023-11-21T15:42:33.165885Z"
+    }
+   },
+   "id": "4772691b96f8c7fa"
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Let's inspect one token:"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "153a7236e2ff56a8"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "TOKEN           LEMMA           UPOS       HEAD       DEPREL    \n",
+      "Cześć           cześć           VERB       0          root      \n"
+     ]
+    }
+   ],
+   "source": [
+    "token = predictions[0].tokens[0]\n",
+    "\n",
+    "# One format string for header and row keeps the columns in sync.\n",
+    "# '<' left-aligns HEAD: numbers are right-aligned by default, which would\n",
+    "# detach the value from its left-aligned header.\n",
+    "row_format = \"{:15} {:15} {:10} {:<10} {:10}\"\n",
+    "print(row_format.format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))\n",
+    "print(row_format.format(token.text, token.lemma, token.upostag, token.head, token.deprel))"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-11-21T15:43:24.164087Z",
+     "start_time": "2023-11-21T15:43:24.091530Z"
+    }
+   },
+   "id": "173da06d7fef299b"
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Let's print all the predicted tokens. For convenience, let's flatten the predictions first."
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "344445d6a2b8893e"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "[Cześć, !, To, jest, przykładowe, zdanie, .]"
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Flatten the per-sentence token lists into a single list, keeping sentence order.\n",
+    "flat_tokens = [token for sentence in predictions for token in sentence.tokens]\n",
+    "flat_tokens"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-11-21T15:46:40.333747Z",
+     "start_time": "2023-11-21T15:46:40.282880Z"
+    }
+   },
+   "id": "718614eb414b6b3a"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "TOKEN           LEMMA           UPOS       HEAD       DEPREL    \n",
+      "Cześć           cześć           VERB       0          root      \n",
+      "!               !               PUNCT      1          punct     \n",
+      "To              to              AUX        4          cop       \n",
+      "jest            być             AUX        4          aux       \n",
+      "przykładowe     przykładowy     ADJ        4          amod      \n",
+      "zdanie          zdanie          NOUN       0          root      \n",
+      ".               .               PUNCT      4          punct     \n"
+     ]
+    }
+   ],
+   "source": [
+    "# One format string for header and rows keeps the columns in sync.\n",
+    "# '<' left-aligns HEAD: numbers are right-aligned by default, which would\n",
+    "# detach the values from their left-aligned header.\n",
+    "row_format = \"{:15} {:15} {:10} {:<10} {:10}\"\n",
+    "print(row_format.format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))\n",
+    "for tok in flat_tokens:\n",
+    "    print(row_format.format(tok.text, tok.lemma, tok.upostag, tok.head, tok.deprel))"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-11-21T15:47:03.697133Z",
+     "start_time": "2023-11-21T15:47:03.670136Z"
+    }
+   },
+   "id": "7722d2623fa2faae"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "c939410d13aac684"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+%% Cell type:markdown id:83e245f2c41847f3 tags:
+
+# Prediction
+
+This notebook will demonstrate how to use a pretrained COMBO model.
+
+%% Cell type:code id:d61cad94b3cde64f tags:
+
+``` python
+from combo.predict import COMBO
+```
+
+%% Cell type:code id:initial_id tags:
+
+``` python
+predictor = COMBO.from_pretrained('polish-herbert-base-ud213')
+```
+
+%% Output
+
+    100%|██████████| 319M/319M [22:15<00:00, 251kB/s]
+    Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.sso.sso_relationship.bias', 'cls.predictions.bias']
+    - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+    - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+
+    Using model LAMBO-UD_Polish-PDB
+
+
+    [2023-11-21 15:37:51 UTC Loading archive] Error while loading Training Data Loader: File with path '/net/pr2/projects/plgrid/plgg_nlp/maja/ud-treebanks-v2.13/UD_Polish-PDB/pl_pdb-ud-train.conllu' does not exist!. Setting Data Loader to None
+
+    Using model LAMBO-UD_Polish-PDB
+
+
+    [2023-11-21 15:37:51 UTC Loading archive] Error while loading Validation Data Loader: File with path '/net/pr2/projects/plgrid/plgg_nlp/maja/ud-treebanks-v2.13/UD_Polish-PDB/pl_pdb-ud-dev.conllu' does not exist!. Setting Data Loader to None
+
+    Using model LAMBO-UD_Polish-PDB
+
+%% Cell type:markdown id:ad45699f49f49450 tags:
+
+Predict a sentence:
+
+%% Cell type:code id:a397812ec6ffbd8a tags:
+
+``` python
+predictions = predictor("Cześć! To jest przykładowe zdanie.")
+```
+
+%% Cell type:markdown id:b7bae793bca448cb tags:
+
+`predictions` is a list of `Sentence` objects, one per sentence in the input text; each holds the sentence's tokens together with its metadata.
+
+%% Cell type:code id:fba5df5411979d52 tags:
+
+``` python
+predictions
+```
+
+%% Output
+
+[Sentence(tokens=[Cześć, !], metadata=matrix([[7.14260619e-03, 9.92857337e-01, 1.21313585e-18],
+         [8.81941795e-01, 1.18058220e-01, 8.37145743e-16],
+         [1.13332543e-08, 1.00000000e+00, 6.85589674e-09]])),
+ Sentence(tokens=[To, jest, przykładowe, zdanie, .], metadata=matrix([[2.30086403e-06, 6.01210148e-09, 1.12105099e-08, 2.96070528e-08,
+          9.99997616e-01, 2.80899673e-14],
+         [1.08029099e-05, 5.30108446e-09, 1.08553975e-08, 1.58541518e-08,
+          9.99989152e-01, 1.11981080e-15],
+         [5.74975857e-05, 8.36746423e-08, 1.03298721e-08, 2.51524526e-08,
+          9.99942422e-01, 2.73261309e-11],
+         [6.84069164e-05, 3.77383458e-09, 4.61269956e-09, 1.75928944e-05,
+          9.99913931e-01, 1.95838829e-13],
+         [9.97742176e-01, 1.03157288e-06, 2.51532147e-05, 5.11952919e-07,
+          2.23120954e-03, 1.63324076e-11],
+         [2.56056478e-06, 4.58555348e-07, 1.24389462e-05, 2.90007080e-07,
+          9.99984264e-01, 2.47912736e-12]]))]
+
+%% Cell type:markdown id:f582c2ef63f87503 tags:
+
+Each token carries the features predicted for it, such as its lemma, part-of-speech tag, head and dependency relation.
+
+%% Cell type:code id:4772691b96f8c7fa tags:
+
+``` python
+predictions[0].tokens
+```
+
+%% Output
+
+[Cześć, !]
+
+%% Cell type:markdown id:153a7236e2ff56a8 tags:
+
+Let's inspect one token:
+
+%% Cell type:code id:173da06d7fef299b tags:
+
+``` python
+token = predictions[0].tokens[0]
+
+# One format string for header and row keeps the columns in sync.
+# '<' left-aligns HEAD: numbers are right-aligned by default, which would
+# detach the value from its left-aligned header.
+row_format = "{:15} {:15} {:10} {:<10} {:10}"
+print(row_format.format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))
+print(row_format.format(token.text, token.lemma, token.upostag, token.head, token.deprel))
+```
+
+%% Output
+
+    TOKEN           LEMMA           UPOS       HEAD       DEPREL
+    Cześć           cześć           VERB       0          root
+
+%% Cell type:markdown id:344445d6a2b8893e tags:
+
+Let's print all the predicted tokens. For convenience, let's flatten the predictions first.
+
+%% Cell type:code id:718614eb414b6b3a tags:
+
+``` python
+# Flatten the per-sentence token lists into a single list, keeping sentence order.
+flat_tokens = [token for sentence in predictions for token in sentence.tokens]
+flat_tokens
+```
+
+%% Output
+
+[Cześć, !, To, jest, przykładowe, zdanie, .]
+
+%% Cell type:code id:7722d2623fa2faae tags:
+
+``` python
+# One format string for header and rows keeps the columns in sync.
+# '<' left-aligns HEAD: numbers are right-aligned by default, which would
+# detach the values from their left-aligned header.
+row_format = "{:15} {:15} {:10} {:<10} {:10}"
+print(row_format.format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))
+for tok in flat_tokens:
+    print(row_format.format(tok.text, tok.lemma, tok.upostag, tok.head, tok.deprel))
+```
+
+%% Output
+
+    TOKEN           LEMMA           UPOS       HEAD       DEPREL
+    Cześć           cześć           VERB       0          root
+    !               !               PUNCT      1          punct
+    To              to              AUX        4          cop
+    jest            być             AUX        4          aux
+    przykładowe     przykładowy     ADJ        4          amod
+    zdanie          zdanie          NOUN       0          root
+    .               .               PUNCT      4          punct
+
+%% Cell type:code id:c939410d13aac684 tags:
+
+``` python
+```
--- a/notebooks/model_training.ipynb
+++ b/notebooks/model_training.ipynb