Skip to content
Snippets Groups Projects
Commit add37596 authored by Tomasz Walkowiak's avatar Tomasz Walkowiak
Browse files

notebook example

parent ce4c9dad
No related merge requests found
%% Cell type:markdown id: tags:
# WebSty - stylometric features for Polish
http://cmst.eu/wp-content/uploads/files/10.12921_cmst.2018.0000007_PIASECKI_c.pdf
The result is a zip file with a set of JSON files containing:
<ul>
<li> feature matrix: data (raw counts), weighted (for example by mi-simple) and transform (by default equal to weighted)</li><li> distances and similarities between documents - calculated from transform.json using a similarity/distance metric (cosine)</li>
</ul>
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import json import json
import requests import requests
import glob import glob
import os import os
import time import time
# E-mail address sent to the service as the 'user' field of every task request.
user = "mojadresemail@przyklad.pl"

# Two alternative processing pipelines are defined below under the same name.
# The LAST assignment is the one that takes effect; comment it out to use the
# first variant instead.

# Variant 1: with document division into smaller parts - 20 kbytes
task = 'any2txt|div(20000)|wcrft2|fextor2({"features":"base interp_signs bigrams","base_modification":"startlist","orth_modification":"startlist","lang":"pl","filters":{"base":[{"type":"lemma_stoplist","args":{"stoplist":"@resources/fextor/ml/polish_base_startlist.txt"}}]}})|dir|featfilt2({"weighting":"all:sm-mi_simple","filter":"min_tf-1 min_df-1","similarity":"cosine"})|makezip'

# Variant 2 (effective): whole documents
task = 'any2txt|wcrft2|fextor2({"features":"base interp_signs bigrams","base_modification":"startlist","orth_modification":"startlist","lang":"pl","filters":{"base":[{"type":"lemma_stoplist","args":{"stoplist":"@resources/fextor/ml/polish_base_startlist.txt"}}]}})|dir|featfilt2({"weighting":"all:sm-mi_simple","filter":"min_tf-1 min_df-1","similarity":"cosine"})|makezip'

# Base address of the REST service; endpoint paths are appended to it.
url = "http://ws.clarin-pl.eu/nlprest2/base"
def upload(file):
    """Send a local file to the service's upload endpoint.

    Args:
        file: Path of the file to upload; it is read in binary mode.

    Returns:
        The response body decoded as UTF-8. The caller uses this string as
        the identifier of the uploaded file.

    Raises:
        requests.HTTPError: If the service replies with a 4xx/5xx status.
    """
    with open(file, "rb") as myfile:
        doc = myfile.read()
    res = requests.post(
        url + '/upload/',
        data=doc,
        headers={'Content-Type': 'binary/octet-stream'},
    )
    # Without this check an error page body would be returned as if it were
    # a valid file id and only fail later, inside the task request.
    res.raise_for_status()
    return res.content.decode('utf-8')
def _get_status(taskid):
    """Fetch and JSON-decode the current status record of a task."""
    resp = requests.get(url + '/getStatus/' + taskid)
    resp.raise_for_status()
    return json.loads(resp.content.decode('utf-8'))


def process(data):
    """Start a task on the service and poll until it leaves the queue.

    Args:
        data: JSON-serialisable task description (the caller passes a dict
            with 'lpmn' and 'user' keys).

    Returns:
        The 'value' field of the final status record, or None when the
        final status is "ERROR" (the error value is printed in that case).

    Raises:
        requests.HTTPError: If any request gets a 4xx/5xx reply.
    """
    doc = json.dumps(data)
    # headers must be passed by keyword: the third positional parameter of
    # requests.post is `json`, so a positional dict never reaches the headers.
    started = requests.post(
        url + '/startTask/',
        data=doc,
        headers={'Content-Type': 'application/json'},
    )
    started.raise_for_status()
    taskid = started.content.decode('utf-8')
    print("Task ID: " + taskid)

    time.sleep(0.2)
    status = _get_status(taskid)
    while status["status"] in ("QUEUE", "PROCESSING"):
        time.sleep(0.5)
        status = _get_status(taskid)
        # '\r' keeps the cursor on the same line so each poll overwrites
        # the previously printed value.
        print(status["value"], end='\r')

    if status["status"] == "ERROR":
        # str() guards against a non-string error payload.
        print("Error " + str(status["value"]))
        return None

    print()  # finish the line left open by the '\r' progress output
    return status["value"]
def main():
    """Run the full example: upload, process, download, and report timing.

    Uploads 'corpus.zip', builds the request string
    'filezip(<uploaded id>)|<task>', runs it through process(), and, when a
    result is returned, downloads the file referenced by the first result
    entry's 'fileID' and writes it to 'out.zip'. The elapsed wall-clock time
    is printed at the end whether or not a result was produced.

    Raises:
        requests.HTTPError: If the download request gets a 4xx/5xx reply.
    """
    in_file = 'corpus.zip'
    out_file = 'out.zip'
    global_time = time.time()

    fileid = upload(in_file)
    print("Uploaded with id " + fileid)

    lpmn = 'filezip(' + fileid + ')|' + task
    result = process({'lpmn': lpmn, 'user': user})
    print("Processing finished")

    if result is not None:
        # The result is a list of entries; the first one's 'fileID' is the
        # path appended to the download endpoint.
        file_path = result[0]["fileID"]
        content = requests.get(url + '/download' + file_path)
        # Do not write an error page to disk as if it were the archive.
        content.raise_for_status()
        with open(out_file, "wb") as outfile:
            outfile.write(content.content)
        print("File saved!")

    print("GLOBAL %s seconds ---" % (time.time() - global_time))


# Run only when executed as a script or notebook cell, not on import.
if __name__ == "__main__":
    main()
``` ```
%% Output %% Output
Uploaded with id /users/default/e7064a0d-c756-4d3e-8a2c-4b0b72143f51 Uploaded with id /users/default/7a46fb4f-1639-4726-8ca7-55eb4ed4df93
Task ID: bca13a9d-ccf3-41cb-978f-68327b606e5a Task ID: da11cb9d-d4a0-4869-bd73-88f7e67ce4e9
[{'name': 'dane', 'fileID': '/requests/feature2/932cf156-c45a-407f-8434-a2a5c7efbcbe'}] [{'name': 'dane', 'fileID': '/requests/makezip/5e09f59e-d42b-4453-b71f-68579033b4e7'}]
Processing finished Processing finished
File saved! File saved!
GLOBAL 3.3759539127349854 seconds --- GLOBAL 23.970514059066772 seconds ---
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment