From 7ca1bc2a57e5036174b5dd31a117ef809a392b51 Mon Sep 17 00:00:00 2001
From: Maja Jablonska <majajjablonska@gmail.com>
Date: Wed, 22 Nov 2023 02:47:39 +1100
Subject: [PATCH] Add a Prediction.ipynb notebook

---
 notebooks/Prediction.ipynb                    | 355 ++++++++++++++++++
 .../{model_training.ipynb => Training.ipynb}  |   0
 2 files changed, 355 insertions(+)
 create mode 100644 notebooks/Prediction.ipynb
 rename notebooks/{model_training.ipynb => Training.ipynb} (100%)

diff --git a/notebooks/Prediction.ipynb b/notebooks/Prediction.ipynb
new file mode 100644
index 0000000..b0ee520
--- /dev/null
+++ b/notebooks/Prediction.ipynb
@@ -0,0 +1,355 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "source": [
+    "# Prediction\n",
+    "\n",
+    "This notebook demonstrates how to load a pretrained COMBO model and use it to predict lemmas, part-of-speech tags and dependency relations for raw text."
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "83e245f2c41847f3"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "outputs": [],
+   "source": [
+    "from combo.predict import COMBO"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-11-21T15:15:18.493035Z",
+     "start_time": "2023-11-21T15:15:14.638080Z"
+    }
+   },
+   "id": "d61cad94b3cde64f"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "ExecuteTime": {
+     "end_time": "2023-11-21T15:37:53.106009Z",
+     "start_time": "2023-11-21T15:15:18.482336Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 319M/319M [22:15<00:00, 251kB/s]    \n",
+      "Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.sso.sso_relationship.bias', 'cls.predictions.bias']\n",
+      "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using model LAMBO-UD_Polish-PDB\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": "loading instances: 0it [00:00, ?it/s]",
+      "application/vnd.jupyter.widget-view+json": {
+       "version_major": 2,
+       "version_minor": 0,
+       "model_id": "382d00831a14426289956738bd3c4266"
+      }
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2023-11-21 15:37:51 UTC Loading archive] Error while loading Training Data Loader: File with path '/net/pr2/projects/plgrid/plgg_nlp/maja/ud-treebanks-v2.13/UD_Polish-PDB/pl_pdb-ud-train.conllu' does not exist!. Setting Data Loader to None\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using model LAMBO-UD_Polish-PDB\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": "loading instances: 0it [00:00, ?it/s]",
+      "application/vnd.jupyter.widget-view+json": {
+       "version_major": 2,
+       "version_minor": 0,
+       "model_id": "b973a8d39ad94eaab4167e5de5418e1c"
+      }
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[2023-11-21 15:37:51 UTC Loading archive] Error while loading Validation Data Loader: File with path '/net/pr2/projects/plgrid/plgg_nlp/maja/ud-treebanks-v2.13/UD_Polish-PDB/pl_pdb-ud-dev.conllu' does not exist!. Setting Data Loader to None\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using model LAMBO-UD_Polish-PDB\n"
+     ]
+    }
+   ],
+   "source": [
+    "predictor = COMBO.from_pretrained('polish-herbert-base-ud213')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Run the predictor on raw text (here, two short sentences):"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "ad45699f49f49450"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "outputs": [],
+   "source": [
+    "predictions = predictor(\"Cześć! To jest przykładowe zdanie.\")"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-11-21T15:41:38.889677Z",
+     "start_time": "2023-11-21T15:41:38.443880Z"
+    }
+   },
+   "id": "a397812ec6ffbd8a"
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "`predictions` is a list of `Sentence` objects, one per detected sentence, each holding its tokens and the corresponding metadata."
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "b7bae793bca448cb"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "[Sentence(tokens=[Cześć, !], metadata=matrix([[7.14260619e-03, 9.92857337e-01, 1.21313585e-18],\n         [8.81941795e-01, 1.18058220e-01, 8.37145743e-16],\n         [1.13332543e-08, 1.00000000e+00, 6.85589674e-09]])),\n Sentence(tokens=[To, jest, przykładowe, zdanie, .], metadata=matrix([[2.30086403e-06, 6.01210148e-09, 1.12105099e-08, 2.96070528e-08,\n          9.99997616e-01, 2.80899673e-14],\n         [1.08029099e-05, 5.30108446e-09, 1.08553975e-08, 1.58541518e-08,\n          9.99989152e-01, 1.11981080e-15],\n         [5.74975857e-05, 8.36746423e-08, 1.03298721e-08, 2.51524526e-08,\n          9.99942422e-01, 2.73261309e-11],\n         [6.84069164e-05, 3.77383458e-09, 4.61269956e-09, 1.75928944e-05,\n          9.99913931e-01, 1.95838829e-13],\n         [9.97742176e-01, 1.03157288e-06, 2.51532147e-05, 5.11952919e-07,\n          2.23120954e-03, 1.63324076e-11],\n         [2.56056478e-06, 4.58555348e-07, 1.24389462e-05, 2.90007080e-07,\n          9.99984264e-01, 2.47912736e-12]]))]"
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "predictions"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-11-21T15:41:45.899260Z",
+     "start_time": "2023-11-21T15:41:45.855424Z"
+    }
+   },
+   "id": "fba5df5411979d52"
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Each sentence exposes its tokens, and every token carries predicted features such as the lemma, UPOS tag, head and dependency relation."
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "f582c2ef63f87503"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "[Cześć, !]"
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "predictions[0].tokens"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-11-21T15:42:33.217710Z",
+     "start_time": "2023-11-21T15:42:33.165885Z"
+    }
+   },
+   "id": "4772691b96f8c7fa"
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Let's inspect one token:"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "153a7236e2ff56a8"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "TOKEN           LEMMA           UPOS       HEAD       DEPREL    \n",
+      "Cześć           cześć           VERB                0 root      \n"
+     ]
+    }
+   ],
+   "source": [
+    "token = predictions[0].tokens[0]\n",
+    "\n",
+    "print(\"{:15} {:15} {:10} {:10} {:10}\".format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))\n",
+    "print(\"{:15} {:15} {:10} {:10} {:10}\".format(token.text, token.lemma, token.upostag, token.head, token.deprel))"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-11-21T15:43:24.164087Z",
+     "start_time": "2023-11-21T15:43:24.091530Z"
+    }
+   },
+   "id": "173da06d7fef299b"
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Let's print all the predicted tokens. For convenience, let's flatten the predictions first."
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "344445d6a2b8893e"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "[Cześć, !, To, jest, przykładowe, zdanie, .]"
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tokens = [p.tokens for p in predictions]\n",
+    "flat_tokens = []\n",
+    "for t in tokens:\n",
+    "    flat_tokens.extend(t)\n",
+    "flat_tokens"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-11-21T15:46:40.333747Z",
+     "start_time": "2023-11-21T15:46:40.282880Z"
+    }
+   },
+   "id": "718614eb414b6b3a"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "TOKEN           LEMMA           UPOS       HEAD       DEPREL    \n",
+      "Cześć           cześć           VERB                0 root      \n",
+      "!               !               PUNCT               1 punct     \n",
+      "To              to              AUX                 4 cop       \n",
+      "jest            być             AUX                 4 aux       \n",
+      "przykładowe     przykładowy     ADJ                 4 amod      \n",
+      "zdanie          zdanie          NOUN                0 root      \n",
+      ".               .               PUNCT               4 punct     \n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"{:15} {:15} {:10} {:10} {:10}\".format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))\n",
+    "for t in flat_tokens:\n",
+    "    print(\"{:15} {:15} {:10} {:10} {:10}\".format(t.text, t.lemma, t.upostag, t.head, t.deprel))"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-11-21T15:47:03.697133Z",
+     "start_time": "2023-11-21T15:47:03.670136Z"
+    }
+   },
+   "id": "7722d2623fa2faae"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "c939410d13aac684"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/model_training.ipynb b/notebooks/Training.ipynb
similarity index 100%
rename from notebooks/model_training.ipynb
rename to notebooks/Training.ipynb
-- 
GitLab