notebook example

add37596 · Tomasz Walkowiak · ce4c9dad · add37596
Commit add37596 authored 5 years ago by Tomasz Walkowiak
--- a/documentation/Doc2vec.ipynb
+++ b/documentation/Doc2vec.ipynb
 {
 "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# WebSty - stylometric features for Polish\n",
+    "\n",
+    "http://cmst.eu/wp-content/uploads/files/10.12921_cmst.2018.0000007_PIASECKI_c.pdf\n",
+    "\n",
+    "Results: a zip file with a set of JSON files containing:\n",
+    "<ul>\n",
+    "<li> feature matrix: data (raw counts), weighted (for example by mi-simple) and transform (by default equal to weighted)</li><li> distances and similarities between documents - calculated from transform.json using a similarity/distance metric (cosine)</li></ul>\n",
+    "\n"
+   ]
+  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Uploaded with id /users/default/e7064a0d-c756-4d3e-8a2c-4b0b72143f51\n",
+      "Uploaded with id /users/default/7a46fb4f-1639-4726-8ca7-55eb4ed4df93\n",
-      "Task ID: bca13a9d-ccf3-41cb-978f-68327b606e5a\n",
+      "Task ID: da11cb9d-d4a0-4869-bd73-88f7e67ce4e9\n",
-      "[{'name': 'dane', 'fileID': '/requests/feature2/932cf156-c45a-407f-8434-a2a5c7efbcbe'}]\n",
+      "[{'name': 'dane', 'fileID': '/requests/makezip/5e09f59e-d42b-4453-b71f-68579033b4e7'}]\n",
      "Processing finished\n",
      "File saved!\n",
-      "GLOBAL 3.3759539127349854 seconds ---\n"
+      "GLOBAL 23.970514059066772 seconds ---\n"
     ]
    }
   ],
@@ -28,9 +42,10 @@
    "user = \"mojadresemail@przyklad.pl\" \n",
    "\n",
    "#with document division into smaller parts - 20 kbytes\n",
-    "task = 'any2txt|div(20000)|doc2vec({\"lang\":\"polish\")|dir|feature2({\"type\":\"word2vec\"})'\n",
+    "task = 'any2txt|div(20000)|wcrft2|fextor2({\"features\":\"base interp_signs bigrams\",\"base_modification\":\"startlist\",\"orth_modification\":\"startlist\",\"lang\":\"pl\",\"filters\":{\"base\":[{\"type\":\"lemma_stoplist\",\"args\":{\"stoplist\":\"@resources/fextor/ml/polish_base_startlist.txt\"}}]}})|dir|featfilt2({\"weighting\":\"all:sm-mi_simple\",\"filter\":\"min_tf-1 min_df-1\",\"similarity\":\"cosine\"})|makezip'\n",
    "#whole documents\n",
-    "task = 'any2txt|doc2vec({\"lang\":\"polish\"})|dir|feature2({\"type\":\"word2vec\"})'\n",
+    "task = 'any2txt|wcrft2|fextor2({\"features\":\"base interp_signs bigrams\",\"base_modification\":\"startlist\",\"orth_modification\":\"startlist\",\"lang\":\"pl\",\"filters\":{\"base\":[{\"type\":\"lemma_stoplist\",\"args\":{\"stoplist\":\"@resources/fextor/ml/polish_base_startlist.txt\"}}]}})|dir|featfilt2({\"weighting\":\"all:sm-mi_simple\",\"filter\":\"min_tf-1 min_df-1\",\"similarity\":\"cosine\"})|makezip'\n",
+    "\n",
    "url = \"http://ws.clarin-pl.eu/nlprest2/base\" \n",
    "\n",
    "def upload(file):\n",
@@ -60,7 +75,7 @@
    "\n",
    "def main():\n",
    "    in_file = 'corpus.zip'\n",
-    "    out_file = 'out.json'\n",
+    "    out_file = 'out.zip'\n",
    "    global_time = time.time()\n",
    "    fileid = upload(in_file)\n",
    "    print(\"Uploaded with id \" + fileid)\n",
@@ -70,7 +85,7 @@
    "    print(\"Processing finished\")\n",
    "    if data != None:\n",
    "        data = data[0][\"fileID\"];\n",
-    "        content = requests.get(url + '/download' + data + \"/weighted.json\");\n",
+    "        content = requests.get(url + '/download' + data);\n",
    "        with open (out_file, \"wb\") as outfile:\n",
    "                outfile.write(content.content)\n",
    "                \n",
@@ -111,7 +126,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.4"
+   "version": "3.7.3"
  }
 },
 "nbformat": 4,

+%% Cell type:markdown id: tags:
+# WebSty - stylometric features for Polish
+http://cmst.eu/wp-content/uploads/files/10.12921_cmst.2018.0000007_PIASECKI_c.pdf
+Results: a zip file with a set of JSON files containing:
+<ul>
+<li> feature matrix: data (raw counts), weighted (for example by mi-simple) and transform (by default equal to weighted)</li><li> distances and similarities between documents - calculated from transform.json using a similarity/distance metric (cosine)</li></ul>
 %% Cell type:code id: tags:
 ``` python
 import json
 import requests
 import glob
 import os
 import time
 # E-mail address sent as the 'user' field of every task request (see main()).
 user = "mojadresemail@przyklad.pl"
 # `task` is the processing pipeline that main() appends after 'filezip(<uploaded id>)|'.
 # Variant with document division into smaller parts - 20 kbytes
-task = 'any2txt|div(20000)|doc2vec({"lang":"polish")|dir|feature2({"type":"word2vec"})'
+task = 'any2txt|div(20000)|wcrft2|fextor2({"features":"base interp_signs bigrams","base_modification":"startlist","orth_modification":"startlist","lang":"pl","filters":{"base":[{"type":"lemma_stoplist","args":{"stoplist":"@resources/fextor/ml/polish_base_startlist.txt"}}]}})|dir|featfilt2({"weighting":"all:sm-mi_simple","filter":"min_tf-1 min_df-1","similarity":"cosine"})|makezip'
 # Variant for whole documents (no div() step).
 # NOTE(review): this assignment comes second, so it replaces the divided-document
 # variant above; comment out whichever variant is not wanted.
-task = 'any2txt|doc2vec({"lang":"polish"})|dir|feature2({"type":"word2vec"})'
+task = 'any2txt|wcrft2|fextor2({"features":"base interp_signs bigrams","base_modification":"startlist","orth_modification":"startlist","lang":"pl","filters":{"base":[{"type":"lemma_stoplist","args":{"stoplist":"@resources/fextor/ml/polish_base_startlist.txt"}}]}})|dir|featfilt2({"weighting":"all:sm-mi_simple","filter":"min_tf-1 min_df-1","similarity":"cosine"})|makezip'
 # Base address of the REST service; '/upload/', '/startTask/', '/getStatus/' and
 # '/download' are appended to it by the functions below.
 url = "http://ws.clarin-pl.eu/nlprest2/base"
 def upload(file):
     """Send a local file to the service and return the identifier it answers with.

     Args:
         file: Path of the file to upload; it is read in binary mode.

     Returns:
         The response body decoded as UTF-8 (main() uses it as the uploaded
         file's id inside the 'filezip(...)' step).

     Raises:
         requests.HTTPError: If the service answers with a 4xx/5xx status.
     """
     with open(file, "rb") as source:
         payload = source.read()
     res = requests.post(
         url + '/upload/',
         data=payload,
         headers={'Content-Type': 'binary/octet-stream'},
     )
     # Without this check an error page would be returned as if it were a
     # file id and silently embedded into the task description.
     res.raise_for_status()
     return res.content.decode('utf-8')
 def process(data):
     """Start a task on the service and poll its status until it stops running.

     Args:
         data: JSON-serialisable task request; in this notebook main() passes
             a dict with the keys 'lpmn' and 'user'.

     Returns:
         The "value" field of the last status response, or None when the
         final status is "ERROR" (the error value is printed first).
     """
     doc = json.dumps(data)
     # The headers must be passed by keyword: the third positional parameter
     # of requests.post() is `json`, so a positional dict never reaches the
     # request headers and the Content-Type would not be sent.
     started = requests.post(
         url + '/startTask/',
         data=doc,
         headers={'Content-Type': 'application/json'},
     )
     taskid = started.content.decode('utf-8')
     print("Task ID: " + taskid)
     time.sleep(0.2)
     resp = requests.get(url + '/getStatus/' + taskid)
     data = json.loads(resp.content.decode('utf-8'))
     # Keep polling while the task is still waiting or running.
     while data["status"] in ("QUEUE", "PROCESSING"):
         time.sleep(0.5)
         resp = requests.get(url + '/getStatus/' + taskid)
         data = json.loads(resp.content.decode('utf-8'))
         # '\r' keeps the progress value on a single, overwritten line.
         print(data["value"], end='\r')
     if data["status"] == "ERROR":
         print("Error " + data["value"])
         return None
     # Finish the progress line before normal output continues.
     print()
     return data["value"]
 def main():
    """Upload 'corpus.zip', run the configured `task` on it and save the result.

    Prints the uploaded id, a completion message and the total wall-clock
    time measured from before the upload.
    """
    in_file = 'corpus.zip'
-    out_file = 'out.json'
+    out_file = 'out.zip'
    global_time = time.time()
    fileid = upload(in_file)
    print("Uploaded with id " + fileid)
    # The uploaded archive is the first pipeline step; `task` supplies the rest.
    lpmn = 'filezip(' + fileid + ')|' + task
    data = {'lpmn': lpmn,'user': user}
    # process() returns None when the task ended with status "ERROR".
    data = process(data)
    print("Processing finished")
    if data != None:
        # The first entry of the result list carries the id of the file to download.
        data = data[0]["fileID"];
-        content = requests.get(url + '/download' + data + "/weighted.json");
+        content = requests.get(url + '/download' + data);
        with open (out_file, "wb") as outfile:
                outfile.write(content.content)
    # NOTE(review): this print sits outside the "if" above, so "File saved!" is
    # also shown when process() returned None and nothing was written.
    print("File saved!")
    print("GLOBAL %s seconds ---" % (time.time() - global_time))
 main()
 ```
 %% Output
-    Uploaded with id /users/default/e7064a0d-c756-4d3e-8a2c-4b0b72143f51
+    Uploaded with id /users/default/7a46fb4f-1639-4726-8ca7-55eb4ed4df93
-    Task ID: bca13a9d-ccf3-41cb-978f-68327b606e5a
+    Task ID: da11cb9d-d4a0-4869-bd73-88f7e67ce4e9
-    [{'name': 'dane', 'fileID': '/requests/feature2/932cf156-c45a-407f-8434-a2a5c7efbcbe'}]
+    [{'name': 'dane', 'fileID': '/requests/makezip/5e09f59e-d42b-4453-b71f-68579033b4e7'}]
    Processing finished
    File saved!
-    GLOBAL 3.3759539127349854 seconds ---
+    GLOBAL 23.970514059066772 seconds ---
 %% Cell type:code id: tags:
 ``` python
 ```
 %% Cell type:code id: tags:
 ``` python
 ```