diff --git a/documentation/Doc2vec.ipynb b/documentation/stylo.ipynb similarity index 62% rename from documentation/Doc2vec.ipynb rename to documentation/stylo.ipynb index 8b8dcfa55a6a30b628ef70ae2fc9927cd0af351c..202d846072ed604dd31c8b57919fbe13929afb96 100644 --- a/documentation/Doc2vec.ipynb +++ b/documentation/stylo.ipynb @@ -1,20 +1,34 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# WebSty - stylometric features for Polish\n", + "\n", + "http://cmst.eu/wp-content/uploads/files/10.12921_cmst.2018.0000007_PIASECKI_c.pdf\n", + "\n", + "Results: a zip file with a set of JSON files containing:\n", + "<ul>\n", + "<li> feature matrix: data (raw counts), weighted (for example by mi-simple) and transform (by default equal to weighted)</li><li> distances and similarities between documents - calculated from transform.json using similarity/distance metric (cosine)</li></ul>\n", + "\n" + ] + }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Uploaded with id /users/default/e7064a0d-c756-4d3e-8a2c-4b0b72143f51\n", - "Task ID: bca13a9d-ccf3-41cb-978f-68327b606e5a\n", - "[{'name': 'dane', 'fileID': '/requests/feature2/932cf156-c45a-407f-8434-a2a5c7efbcbe'}]\n", + "Uploaded with id /users/default/7a46fb4f-1639-4726-8ca7-55eb4ed4df93\n", + "Task ID: da11cb9d-d4a0-4869-bd73-88f7e67ce4e9\n", + "[{'name': 'dane', 'fileID': '/requests/makezip/5e09f59e-d42b-4453-b71f-68579033b4e7'}]\n", "Processing finished\n", "File saved!\n", - "GLOBAL 3.3759539127349854 seconds ---\n" + "GLOBAL 23.970514059066772 seconds ---\n" ] } ], @@ -28,9 +42,10 @@ "user = \"mojadresemail@przyklad.pl\" \n", "\n", "#with document division into smaller parts - 20 kbytes\n", - "task = 'any2txt|div(20000)|doc2vec({\"lang\":\"polish\")|dir|feature2({\"type\":\"word2vec\"})'\n", + "task = 'any2txt|div(20000)|wcrft2|fextor2({\"features\":\"base interp_signs 
bigrams\",\"base_modification\":\"startlist\",\"orth_modification\":\"startlist\",\"lang\":\"pl\",\"filters\":{\"base\":[{\"type\":\"lemma_stoplist\",\"args\":{\"stoplist\":\"@resources/fextor/ml/polish_base_startlist.txt\"}}]}})|dir|featfilt2({\"weighting\":\"all:sm-mi_simple\",\"filter\":\"min_tf-1 min_df-1\",\"similarity\":\"cosine\"})|makezip'\n", "#whole documents\n", - "task = 'any2txt|doc2vec({\"lang\":\"polish\"})|dir|feature2({\"type\":\"word2vec\"})'\n", + "task = 'any2txt|wcrft2|fextor2({\"features\":\"base interp_signs bigrams\",\"base_modification\":\"startlist\",\"orth_modification\":\"startlist\",\"lang\":\"pl\",\"filters\":{\"base\":[{\"type\":\"lemma_stoplist\",\"args\":{\"stoplist\":\"@resources/fextor/ml/polish_base_startlist.txt\"}}]}})|dir|featfilt2({\"weighting\":\"all:sm-mi_simple\",\"filter\":\"min_tf-1 min_df-1\",\"similarity\":\"cosine\"})|makezip'\n", + "\n", "url = \"http://ws.clarin-pl.eu/nlprest2/base\" \n", "\n", "def upload(file):\n", @@ -60,7 +75,7 @@ "\n", "def main():\n", " in_file = 'corpus.zip'\n", - " out_file = 'out.json'\n", + " out_file = 'out.zip'\n", " global_time = time.time()\n", " fileid = upload(in_file)\n", " print(\"Uploaded with id \" + fileid)\n", @@ -70,7 +85,7 @@ " print(\"Processing finished\")\n", " if data != None:\n", " data = data[0][\"fileID\"];\n", - " content = requests.get(url + '/download' + data + \"/weighted.json\");\n", + " content = requests.get(url + '/download' + data);\n", " with open (out_file, \"wb\") as outfile:\n", " outfile.write(content.content)\n", " \n", @@ -111,7 +126,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.7.3" } }, "nbformat": 4,