{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# WebSty - stylometric features for Polish\n",
    "\n",
    "http://cmst.eu/wp-content/uploads/files/10.12921_cmst.2018.0000007_PIASECKI_c.pdf\n",
    "\n",
    "The result is a zip file with a set of JSON files containing:\n",
    "<ul>\n",
    "<li> feature matrix: data (raw counts), weighted (for example by mi-simple) and transform (by default equal to weighted)</li>\n",
    "<li> distances and similarities between documents - calculated from transform.json using a similarity/distance metric (cosine)</li>\n",
    "</ul>\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Uploaded with id /users/default/7a46fb4f-1639-4726-8ca7-55eb4ed4df93\n",
      "Task ID: da11cb9d-d4a0-4869-bd73-88f7e67ce4e9\n",
      "[{'name': 'dane', 'fileID': '/requests/makezip/5e09f59e-d42b-4453-b71f-68579033b4e7'}]\n",
      "Processing finished\n",
      "File saved!\n",
      "GLOBAL 23.970514059066772 seconds ---\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import requests\n",
    "import glob\n",
    "import os\n",
    "import time\n",
    "\n",
    "# E-mail address sent as the 'user' field of every task request; replace the placeholder with your own.\n",
    "user = \"mojadresemail@przyklad.pl\"\n",
    "\n",
    "# Base address of the REST service; upload(), process() and main() append their endpoints to it.\n",
    "url = \"http://ws.clarin-pl.eu/nlprest2/base\"\n",
    "\n",
    "# Seconds to wait for a server response before giving up, so a stalled connection cannot hang the notebook.\n",
    "REQUEST_TIMEOUT = 120\n",
    "\n",
    "# Pipeline steps shared by both task variants below.\n",
    "_feature_steps = 'wcrft2|fextor2({\"features\":\"base interp_signs bigrams\",\"base_modification\":\"startlist\",\"orth_modification\":\"startlist\",\"lang\":\"pl\",\"filters\":{\"base\":[{\"type\":\"lemma_stoplist\",\"args\":{\"stoplist\":\"@resources/fextor/ml/polish_base_startlist.txt\"}}]}})|dir|featfilt2({\"weighting\":\"all:sm-mi_simple\",\"filter\":\"min_tf-1 min_df-1\",\"similarity\":\"cosine\"})|makezip'\n",
    "\n",
    "# with document division into smaller parts - 20 kbytes\n",
    "task_divided = 'any2txt|div(20000)|' + _feature_steps\n",
    "# whole documents\n",
    "task_whole = 'any2txt|' + _feature_steps\n",
    "\n",
    "# Variant submitted by main(); assign task_divided instead to split the documents first.\n",
    "task = task_whole\n",
    "\n",
    "\n",
    "def upload(file):\n",
    "    \"\"\"Send the file at path `file` to the service.\n",
    "\n",
    "    Returns the response body decoded as UTF-8 (main() uses it as the uploaded file's id).\n",
    "    Raises requests.HTTPError when the server answers with an error status.\n",
    "    \"\"\"\n",
    "    with open(file, \"rb\") as myfile:\n",
    "        doc = myfile.read()\n",
    "    res = requests.post(url + '/upload/', data=doc, headers={'Content-Type': 'binary/octet-stream'}, timeout=REQUEST_TIMEOUT)\n",
    "    # Fail here rather than hand an error page to the caller as if it were a file id.\n",
    "    res.raise_for_status()\n",
    "    return res.content.decode('utf-8')\n",
    "\n",
    "\n",
    "def _get_status(taskid):\n",
    "    \"\"\"Fetch the status document of task `taskid` and return it parsed from JSON.\"\"\"\n",
    "    resp = requests.get(url + '/getStatus/' + taskid, timeout=REQUEST_TIMEOUT)\n",
    "    resp.raise_for_status()\n",
    "    return json.loads(resp.content.decode('utf-8'))\n",
    "\n",
    "\n",
    "def process(data):\n",
    "    \"\"\"Start the task described by `data` and poll it until it is no longer queued or processing.\n",
    "\n",
    "    `data` is serialised to JSON and posted to the startTask endpoint.\n",
    "    Returns the \"value\" field of the final status document, or None (after printing\n",
    "    the error) when the final status is \"ERROR\".\n",
    "    Raises requests.HTTPError when any request gets an error status.\n",
    "    \"\"\"\n",
    "    doc = json.dumps(data)\n",
    "    # Pass data= and headers= by keyword: the third positional parameter of requests.post is `json`, not headers.\n",
    "    resp = requests.post(url + '/startTask/', data=doc, headers={'Content-Type': 'application/json'}, timeout=REQUEST_TIMEOUT)\n",
    "    resp.raise_for_status()\n",
    "    taskid = resp.content.decode('utf-8')\n",
    "    print(\"Task ID: \" + taskid)\n",
    "    time.sleep(0.2)\n",
    "    status = _get_status(taskid)\n",
    "    while status[\"status\"] in (\"QUEUE\", \"PROCESSING\"):\n",
    "        time.sleep(0.5)\n",
    "        status = _get_status(taskid)\n",
    "        # '\\r' keeps the progress value on a single, continuously overwritten line.\n",
    "        print(status[\"value\"], end='\\r')\n",
    "    if status[\"status\"] == \"ERROR\":\n",
    "        # str() so a non-string error value cannot raise TypeError while reporting it.\n",
    "        print(\"Error \" + str(status[\"value\"]))\n",
    "        return None\n",
    "    print()\n",
    "    return status[\"value\"]\n",
    "\n",
    "\n",
    "def main(in_file='corpus.zip', out_file='out.zip'):\n",
    "    \"\"\"Upload `in_file`, run `task` on it and save the downloaded result to `out_file`.\n",
    "\n",
    "    Progress and the total elapsed time are printed; nothing is written when\n",
    "    processing fails or returns no result.\n",
    "    \"\"\"\n",
    "    global_time = time.time()\n",
    "    fileid = upload(in_file)\n",
    "    print(\"Uploaded with id \" + fileid)\n",
    "    lpmn = 'filezip(' + fileid + ')|' + task\n",
    "    result = process({'lpmn': lpmn, 'user': user})\n",
    "    if result:\n",
    "        print(\"Processing finished\")\n",
    "        # The first entry of the result carries the id of the file to download.\n",
    "        download = requests.get(url + '/download' + result[0][\"fileID\"], timeout=REQUEST_TIMEOUT)\n",
    "        # Check the status before writing so an error response is never saved as the archive.\n",
    "        download.raise_for_status()\n",
    "        with open(out_file, \"wb\") as outfile:\n",
    "            outfile.write(download.content)\n",
    "        print(\"File saved!\")\n",
    "    else:\n",
    "        print(\"No result to download\")\n",
    "    print(\"GLOBAL %s seconds ---\" % (time.time() - global_time))\n",
    "\n",
    "main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}