Commit 655c0975 authored by Tomasz Walkowiak's avatar Tomasz Walkowiak

notebook example

parent 97891577
[service]
tool = featfilt2_test
tool = featfilt2
root = /samba/requests/
rabbit_host = rabbit.clarin.ws
......
venv/
*.json
.ipynb_checkpoints/
Aby uruchomić przykład, należy wykonać poniższy ciąg komend:
1. `virtualenv -p python3.6 venv`
2. `. venv/bin/activate`
3. `pip install -r requirements.txt`
4. `jupyter notebook`
Po wykonaniu ostatniej komendy w przeglądarce powinien pojawić się interaktywny
przykład wykonania polecenia w notacji LPMN.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# WebSty - stylometric features for Polish\n",
"\n",
"http://cmst.eu/wp-content/uploads/files/10.12921_cmst.2018.0000007_PIASECKI_c.pdf\n",
"\n",
"The result is a zip file with a set of JSON files containing:\n",
"<ul>\n",
"<li> feature matrix: data (raw counts), weighted (for example by mi-simple) and transform (by default equal to weighted)</li><li> distances and similarities between documents - calculated from transform.json using a similarity/distance metric (cosine)</li>\n",
"</ul>\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Uploaded with id /users/default/7a46fb4f-1639-4726-8ca7-55eb4ed4df93\n",
"Task ID: da11cb9d-d4a0-4869-bd73-88f7e67ce4e9\n",
"[{'name': 'dane', 'fileID': '/requests/makezip/5e09f59e-d42b-4453-b71f-68579033b4e7'}]\n",
"Processing finished\n",
"File saved!\n",
"GLOBAL 23.970514059066772 seconds ---\n"
]
}
],
"source": [
"import json\n",
"import requests\n",
"import glob\n",
"import os\n",
"import time\n",
"\n",
"user = \"mojadresemail@przyklad.pl\"  # sent as the 'user' field of the task request\n",
"\n",
"# Processing steps shared by both task variants below.\n",
"PIPELINE_TAIL = 'wcrft2|fextor2({\"features\":\"base interp_signs bigrams\",\"base_modification\":\"startlist\",\"orth_modification\":\"startlist\",\"lang\":\"pl\",\"filters\":{\"base\":[{\"type\":\"lemma_stoplist\",\"args\":{\"stoplist\":\"@resources/fextor/ml/polish_base_startlist.txt\"}}]}})|dir|featfilt2({\"weighting\":\"all:sm-mi_simple\",\"filter\":\"min_tf-1 min_df-1\",\"similarity\":\"cosine\"})|makezip'\n",
"# Variant 1: with document division into smaller parts - 20 kbytes.\n",
"TASK_DIVIDED = 'any2txt|div(20000)|' + PIPELINE_TAIL\n",
"# Variant 2: whole documents.\n",
"TASK_WHOLE = 'any2txt|' + PIPELINE_TAIL\n",
"# Variant that is actually submitted; assign TASK_DIVIDED here to split documents first.\n",
"task = TASK_WHOLE\n",
"\n",
"url = \"http://ws.clarin-pl.eu/nlprest2/base\"\n",
"\n",
"def upload(file):\n",
"    \"\"\"Upload a local file to the service and return the ID assigned to it.\n",
"\n",
"    Args:\n",
"        file: Path of the file to send; it is read in binary mode.\n",
"\n",
"    Returns:\n",
"        The body of the upload response decoded as UTF-8, which the caller\n",
"        uses as the ID of the uploaded file.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: If the service answers with a 4xx/5xx status.\n",
"    \"\"\"\n",
"    with open(file, \"rb\") as myfile:\n",
"        doc = myfile.read()\n",
"    res = requests.post(url + '/upload/', data=doc, headers={'Content-Type': 'binary/octet-stream'})\n",
"    # Without this check the body of an HTTP error page would be returned as a file ID.\n",
"    res.raise_for_status()\n",
"    return res.content.decode('utf-8')\n",
"\n",
"def process(data):\n",
"    \"\"\"Start a task on the service and poll its status until it stops being queued/processed.\n",
"\n",
"    Args:\n",
"        data: JSON-serialisable request payload; the caller passes a dict\n",
"            with the keys 'lpmn' and 'user'.\n",
"\n",
"    Returns:\n",
"        The \"value\" field of the final status response, or None when the\n",
"        final status is \"ERROR\" (the error value is printed first).\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: If any request is answered with a 4xx/5xx status.\n",
"    \"\"\"\n",
"    # headers must be passed by keyword: the third positional parameter of\n",
"    # requests.post is `json`, so a positional dict is never sent as headers.\n",
"    start = requests.post(url + '/startTask/', data=json.dumps(data), headers={'Content-Type': 'application/json'})\n",
"    start.raise_for_status()\n",
"    taskid = start.content.decode('utf-8')\n",
"    print(\"Task ID: \" + taskid)\n",
"\n",
"    status_url = url + '/getStatus/' + taskid\n",
"    time.sleep(0.2)\n",
"    resp = requests.get(status_url)\n",
"    resp.raise_for_status()\n",
"    status = json.loads(resp.content.decode('utf-8'))\n",
"    while status[\"status\"] in (\"QUEUE\", \"PROCESSING\"):\n",
"        time.sleep(0.5)\n",
"        resp = requests.get(status_url)\n",
"        resp.raise_for_status()\n",
"        status = json.loads(resp.content.decode('utf-8'))\n",
"        # '\\r' keeps the most recent value on a single console line.\n",
"        print(status[\"value\"], end='\\r')\n",
"    if status[\"status\"] == \"ERROR\":\n",
"        # str() keeps the message printable even if the value is not a string.\n",
"        print(\"Error \" + str(status[\"value\"]))\n",
"        return None\n",
"    print()\n",
"    return status[\"value\"]\n",
"\n",
"def main():\n",
"    \"\"\"Upload corpus.zip, run the configured task on it and save the result as out.zip.\n",
"\n",
"    Prints the upload ID, progress messages and the total elapsed time.\n",
"    Nothing is written to disk when process() returns None.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: If the result download is answered with a 4xx/5xx status.\n",
"    \"\"\"\n",
"    in_file = 'corpus.zip'\n",
"    out_file = 'out.zip'\n",
"    global_time = time.time()\n",
"    fileid = upload(in_file)\n",
"    print(\"Uploaded with id \" + fileid)\n",
"    lpmn = 'filezip(' + fileid + ')|' + task\n",
"    result = process({'lpmn': lpmn, 'user': user})\n",
"    print(\"Processing finished\")\n",
"    if result is not None:\n",
"        # The first result entry carries the ID of the file to download.\n",
"        result_id = result[0][\"fileID\"]\n",
"        download = requests.get(url + '/download' + result_id)\n",
"        # Do not write an HTTP error page to disk as if it were the archive.\n",
"        download.raise_for_status()\n",
"        with open(out_file, \"wb\") as outfile:\n",
"            outfile.write(download.content)\n",
"        print(\"File saved!\")\n",
"    print(\"GLOBAL %s seconds ---\" % (time.time() - global_time))\n",
"\n",
"main()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment