{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# WebSty - stylometric features for Polish\n",
    "\n",
    "http://cmst.eu/wp-content/uploads/files/10.12921_cmst.2018.0000007_PIASECKI_c.pdf\n",
    "\n",
    "The result is a zip file with a set of JSON files containing:\n",
    "<ul>\n",
    "<li> feature matrix: data (raw counts), weighted (for example by mi-simple) and transform (by default equal to weighted)</li><li> distances and similarities between documents - calculated from transform.json using a similarity/distance metric (cosine)</li>\n",
    "</ul>\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Uploaded with id /users/default/7a46fb4f-1639-4726-8ca7-55eb4ed4df93\n",
      "Task ID: da11cb9d-d4a0-4869-bd73-88f7e67ce4e9\n",
      "[{'name': 'dane', 'fileID': '/requests/makezip/5e09f59e-d42b-4453-b71f-68579033b4e7'}]\n",
      "Processing finished\n",
      "File saved!\n",
      "GLOBAL 23.970514059066772 seconds ---\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import requests\n",
    "import glob\n",
    "import os\n",
    "import time\n",
    "\n",
    "user = \"mojadresemail@przyklad.pl\"\n",
    "\n",
    "# Pipeline stages shared by both task variants below.\n",
    "_task_tail = 'wcrft2|fextor2({\"features\":\"base interp_signs bigrams\",\"base_modification\":\"startlist\",\"orth_modification\":\"startlist\",\"lang\":\"pl\",\"filters\":{\"base\":[{\"type\":\"lemma_stoplist\",\"args\":{\"stoplist\":\"@resources/fextor/ml/polish_base_startlist.txt\"}}]}})|dir|featfilt2({\"weighting\":\"all:sm-mi_simple\",\"filter\":\"min_tf-1 min_df-1\",\"similarity\":\"cosine\"})|makezip'\n",
    "\n",
    "# Variant with document division into smaller parts - 20 kbytes.\n",
    "task_divided = 'any2txt|div(20000)|' + _task_tail\n",
    "# Variant processing whole documents.\n",
    "task_whole = 'any2txt|' + _task_tail\n",
    "\n",
    "# Pipeline submitted by main(). Previously the divided variant was assigned and then\n",
    "# immediately overwritten, so the whole-document variant is the effective choice.\n",
    "# Switch to task_divided to split documents before processing.\n",
    "task = task_whole\n",
    "\n",
    "url = \"http://ws.clarin-pl.eu/nlprest2/base\"\n",
    "\n",
    "def upload(file):\n",
    "    \"\"\"Upload a local file to the service and return the response body as text.\n",
    "\n",
    "    Args:\n",
    "        file: Path of the file that is read in binary mode and POSTed to\n",
    "            ``url + '/upload/'``.\n",
    "\n",
    "    Returns:\n",
    "        The UTF-8 decoded response body (main() uses it as the uploaded file id).\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: If the service answers with a 4xx/5xx status.\n",
    "    \"\"\"\n",
    "    with open(file, \"rb\") as myfile:\n",
    "        doc = myfile.read()\n",
    "    res = requests.post(url + '/upload/', data=doc, headers={'Content-Type': 'binary/octet-stream'})\n",
    "    # Fail loudly instead of returning an HTTP error page as if it were a file id.\n",
    "    res.raise_for_status()\n",
    "    return res.content.decode('utf-8')\n",
    "\n",
    "def process(data):\n",
    "    \"\"\"Start a task on the service and poll until it is no longer queued or processing.\n",
    "\n",
    "    Args:\n",
    "        data: JSON-serialisable task description that is POSTed to\n",
    "            ``url + '/startTask/'``.\n",
    "\n",
    "    Returns:\n",
    "        The ``\"value\"`` field of the final status response, or None when the\n",
    "        final status is ``\"ERROR\"`` (the error value is printed in that case).\n",
    "    \"\"\"\n",
    "    def fetch_status(taskid):\n",
    "        # One status round-trip: GET the status endpoint and parse the JSON body.\n",
    "        resp = requests.get(url + '/getStatus/' + taskid)\n",
    "        return json.loads(resp.content.decode('utf-8'))\n",
    "\n",
    "    doc = json.dumps(data)\n",
    "    # The headers must be passed by keyword: the third positional parameter of\n",
    "    # requests.post is `json`, so the Content-Type header was previously never sent.\n",
    "    start = requests.post(url + '/startTask/', data=doc, headers={'Content-Type': 'application/json'})\n",
    "    taskid = start.content.decode('utf-8')\n",
    "    print(\"Task ID: \" + taskid)\n",
    "    time.sleep(0.2)\n",
    "\n",
    "    status = fetch_status(taskid)\n",
    "    while status[\"status\"] in (\"QUEUE\", \"PROCESSING\"):\n",
    "        time.sleep(0.5)\n",
    "        status = fetch_status(taskid)\n",
    "        # The carriage return keeps the reported value on a single console line.\n",
    "        print(status[\"value\"], end='\\r')\n",
    "    if status[\"status\"] == \"ERROR\":\n",
    "        # str() keeps a non-string error payload from breaking the concatenation.\n",
    "        print(\"Error \" + str(status[\"value\"]))\n",
    "        return None\n",
    "    print()\n",
    "    return status[\"value\"]\n",
    "\n",
    "def main():\n",
    "    \"\"\"Upload the corpus, run the configured pipeline and save the resulting zip.\n",
    "\n",
    "    Reads ``corpus.zip``, submits ``task`` for it through process(), and writes the\n",
    "    downloaded result to ``out.zip``. Prints progress messages and the total\n",
    "    elapsed time.\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: If downloading the result fails with a 4xx/5xx status.\n",
    "    \"\"\"\n",
    "    in_file = 'corpus.zip'\n",
    "    out_file = 'out.zip'\n",
    "    global_time = time.time()\n",
    "    fileid = upload(in_file)\n",
    "    print(\"Uploaded with id \" + fileid)\n",
    "    lpmn = 'filezip(' + fileid + ')|' + task\n",
    "    result = process({'lpmn': lpmn, 'user': user})\n",
    "    print(\"Processing finished\")\n",
    "    if result is not None:\n",
    "        file_path = result[0][\"fileID\"]\n",
    "        content = requests.get(url + '/download' + file_path)\n",
    "        # Do not write an HTTP error body to disk as if it were the archive.\n",
    "        content.raise_for_status()\n",
    "        with open(out_file, \"wb\") as outfile:\n",
    "            outfile.write(content.content)\n",
    "        # Report success only when a file was actually written.\n",
    "        print(\"File saved!\")\n",
    "    print(\"GLOBAL %s seconds ---\" % (time.time() - global_time))\n",
    "\n",
    "main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}