Commit add37596 authored by Tomasz Walkowiak's avatar Tomasz Walkowiak

notebook example

parent ce4c9dad
{ {
"cells": [ "cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# WebSty - stylometric features for Polish\n",
"\n",
"http://cmst.eu/wp-content/uploads/files/10.12921_cmst.2018.0000007_PIASECKI_c.pdf\n",
"\n",
"Results: a zip file with a set of JSON files containing:\n",
"<ul>\n",
"<li> feature matrix: data (raw counts), weighted (for example by mi-simple) and transform (by default equal to weighted)</li><li> distances and similarities between documents - calculated from transform.json using a similarity/distance metric (cosine)</li>\n",
"</ul>\n"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Uploaded with id /users/default/e7064a0d-c756-4d3e-8a2c-4b0b72143f51\n", "Uploaded with id /users/default/7a46fb4f-1639-4726-8ca7-55eb4ed4df93\n",
"Task ID: bca13a9d-ccf3-41cb-978f-68327b606e5a\n", "Task ID: da11cb9d-d4a0-4869-bd73-88f7e67ce4e9\n",
"[{'name': 'dane', 'fileID': '/requests/feature2/932cf156-c45a-407f-8434-a2a5c7efbcbe'}]\n", "[{'name': 'dane', 'fileID': '/requests/makezip/5e09f59e-d42b-4453-b71f-68579033b4e7'}]\n",
"Processing finished\n", "Processing finished\n",
"File saved!\n", "File saved!\n",
"GLOBAL 3.3759539127349854 seconds ---\n" "GLOBAL 23.970514059066772 seconds ---\n"
] ]
} }
], ],
...@@ -28,9 +42,10 @@ ...@@ -28,9 +42,10 @@
"user = \"mojadresemail@przyklad.pl\" \n", "user = \"mojadresemail@przyklad.pl\" \n",
"\n", "\n",
"#with document division into smaller parts - 20 kbytes\n", "#with document division into smaller parts - 20 kbytes\n",
"task = 'any2txt|div(20000)|doc2vec({\"lang\":\"polish\")|dir|feature2({\"type\":\"word2vec\"})'\n", "task = 'any2txt|div(20000)|wcrft2|fextor2({\"features\":\"base interp_signs bigrams\",\"base_modification\":\"startlist\",\"orth_modification\":\"startlist\",\"lang\":\"pl\",\"filters\":{\"base\":[{\"type\":\"lemma_stoplist\",\"args\":{\"stoplist\":\"@resources/fextor/ml/polish_base_startlist.txt\"}}]}})|dir|featfilt2({\"weighting\":\"all:sm-mi_simple\",\"filter\":\"min_tf-1 min_df-1\",\"similarity\":\"cosine\"})|makezip'\n",
"#whole documents\n", "#whole documents\n",
"task = 'any2txt|doc2vec({\"lang\":\"polish\"})|dir|feature2({\"type\":\"word2vec\"})'\n", "task = 'any2txt|wcrft2|fextor2({\"features\":\"base interp_signs bigrams\",\"base_modification\":\"startlist\",\"orth_modification\":\"startlist\",\"lang\":\"pl\",\"filters\":{\"base\":[{\"type\":\"lemma_stoplist\",\"args\":{\"stoplist\":\"@resources/fextor/ml/polish_base_startlist.txt\"}}]}})|dir|featfilt2({\"weighting\":\"all:sm-mi_simple\",\"filter\":\"min_tf-1 min_df-1\",\"similarity\":\"cosine\"})|makezip'\n",
"\n",
"url = \"http://ws.clarin-pl.eu/nlprest2/base\" \n", "url = \"http://ws.clarin-pl.eu/nlprest2/base\" \n",
"\n", "\n",
"def upload(file):\n", "def upload(file):\n",
...@@ -60,7 +75,7 @@ ...@@ -60,7 +75,7 @@
"\n", "\n",
"def main():\n", "def main():\n",
" in_file = 'corpus.zip'\n", " in_file = 'corpus.zip'\n",
" out_file = 'out.json'\n", " out_file = 'out.zip'\n",
" global_time = time.time()\n", " global_time = time.time()\n",
" fileid = upload(in_file)\n", " fileid = upload(in_file)\n",
" print(\"Uploaded with id \" + fileid)\n", " print(\"Uploaded with id \" + fileid)\n",
...@@ -70,7 +85,7 @@ ...@@ -70,7 +85,7 @@
" print(\"Processing finished\")\n", " print(\"Processing finished\")\n",
" if data != None:\n", " if data != None:\n",
" data = data[0][\"fileID\"];\n", " data = data[0][\"fileID\"];\n",
" content = requests.get(url + '/download' + data + \"/weighted.json\");\n", " content = requests.get(url + '/download' + data);\n",
" with open (out_file, \"wb\") as outfile:\n", " with open (out_file, \"wb\") as outfile:\n",
" outfile.write(content.content)\n", " outfile.write(content.content)\n",
" \n", " \n",
...@@ -111,7 +126,7 @@ ...@@ -111,7 +126,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.7.4" "version": "3.7.3"
} }
}, },
"nbformat": 4, "nbformat": 4,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment