Commit add37596 authored by Tomasz Walkowiak's avatar Tomasz Walkowiak

notebook example

parent ce4c9dad
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# WebSty - stylometric features for Polish\n",
"\n",
"http://cmst.eu/wp-content/uploads/files/10.12921_cmst.2018.0000007_PIASECKI_c.pdf\n",
"\n",
"Results: a zip file with a set of JSON files containing:\n",
"<ul>\n",
"<li> feature matrix: data (raw counts), weighted (for example by mi-simple) and transform (by default equal to weighted)</li><li> distances and similarities between documents - calculated from transform.json using a similarity/distance metric (cosine)</li></ul>\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Uploaded with id /users/default/e7064a0d-c756-4d3e-8a2c-4b0b72143f51\n",
"Task ID: bca13a9d-ccf3-41cb-978f-68327b606e5a\n",
"[{'name': 'dane', 'fileID': '/requests/feature2/932cf156-c45a-407f-8434-a2a5c7efbcbe'}]\n",
"Uploaded with id /users/default/7a46fb4f-1639-4726-8ca7-55eb4ed4df93\n",
"Task ID: da11cb9d-d4a0-4869-bd73-88f7e67ce4e9\n",
"[{'name': 'dane', 'fileID': '/requests/makezip/5e09f59e-d42b-4453-b71f-68579033b4e7'}]\n",
"Processing finished\n",
"File saved!\n",
"GLOBAL 3.3759539127349854 seconds ---\n"
"GLOBAL 23.970514059066772 seconds ---\n"
]
}
],
......@@ -28,9 +42,10 @@
"user = \"mojadresemail@przyklad.pl\" \n",
"\n",
"#with document division into smaller parts - 20 kbytes\n",
"task = 'any2txt|div(20000)|doc2vec({\"lang\":\"polish\")|dir|feature2({\"type\":\"word2vec\"})'\n",
"task = 'any2txt|div(20000)|wcrft2|fextor2({\"features\":\"base interp_signs bigrams\",\"base_modification\":\"startlist\",\"orth_modification\":\"startlist\",\"lang\":\"pl\",\"filters\":{\"base\":[{\"type\":\"lemma_stoplist\",\"args\":{\"stoplist\":\"@resources/fextor/ml/polish_base_startlist.txt\"}}]}})|dir|featfilt2({\"weighting\":\"all:sm-mi_simple\",\"filter\":\"min_tf-1 min_df-1\",\"similarity\":\"cosine\"})|makezip'\n",
"#whole documents\n",
"task = 'any2txt|doc2vec({\"lang\":\"polish\"})|dir|feature2({\"type\":\"word2vec\"})'\n",
"task = 'any2txt|wcrft2|fextor2({\"features\":\"base interp_signs bigrams\",\"base_modification\":\"startlist\",\"orth_modification\":\"startlist\",\"lang\":\"pl\",\"filters\":{\"base\":[{\"type\":\"lemma_stoplist\",\"args\":{\"stoplist\":\"@resources/fextor/ml/polish_base_startlist.txt\"}}]}})|dir|featfilt2({\"weighting\":\"all:sm-mi_simple\",\"filter\":\"min_tf-1 min_df-1\",\"similarity\":\"cosine\"})|makezip'\n",
"\n",
"url = \"http://ws.clarin-pl.eu/nlprest2/base\" \n",
"\n",
"def upload(file):\n",
......@@ -60,7 +75,7 @@
"\n",
"def main():\n",
" in_file = 'corpus.zip'\n",
" out_file = 'out.json'\n",
" out_file = 'out.zip'\n",
" global_time = time.time()\n",
" fileid = upload(in_file)\n",
" print(\"Uploaded with id \" + fileid)\n",
......@@ -70,7 +85,7 @@
" print(\"Processing finished\")\n",
" if data != None:\n",
" data = data[0][\"fileID\"];\n",
" content = requests.get(url + '/download' + data + \"/weighted.json\");\n",
" content = requests.get(url + '/download' + data);\n",
" with open (out_file, \"wb\") as outfile:\n",
" outfile.write(content.content)\n",
" \n",
......@@ -111,7 +126,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.7.3"
}
},
"nbformat": 4,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment