Skip to content
Snippets Groups Projects
Commit ce4c9dad authored by Tomasz Walkowiak's avatar Tomasz Walkowiak
Browse files

notebook example

parent 655c0975
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
# WebSty - stylometric features for Polish
http://cmst.eu/wp-content/uploads/files/10.12921_cmst.2018.0000007_PIASECKI_c.pdf
The result is a zip file with a set of JSON files containing:
<ul>
<li> feature matrix: data (raw counts), weighted (for example by mi-simple) and transform (by default equal to weighted)</li><li> distances and similarities between documents - calculated from transform.json using a similarity/distance metric (cosine)</li>
</ul>
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import json import json
import requests import requests
import glob import glob
import os import os
import time import time
# Module-level configuration read by upload(), process() and main().
# NOTE(review): every statement below appears twice on its line - presumably
# the old and new columns of a side-by-side diff rendering; confirm before
# running this text as Python.

# Value sent as the 'user' field of the task request built in main().
user = "mojadresemail@przyklad.pl" user = "mojadresemail@przyklad.pl"
# Pipeline string that main() appends to 'filezip(<fileid>)|' to form the
# 'lpmn' field of the request.  This first assignment is overwritten by the
# "whole documents" assignment further down, so it has no effect as written.
# NOTE(review): the right-hand variant reads doc2vec({"lang":"polish") - the
# closing '}' present in the whole-documents variant is missing here.
#with document division into smaller parts - 20 kbytes #with document division into smaller parts - 20 kbytes
task = 'any2txt|div(20000)|wcrft2|fextor2({"features":"base interp_signs bigrams","base_modification":"startlist","orth_modification":"startlist","lang":"pl","filters":{"base":[{"type":"lemma_stoplist","args":{"stoplist":"@resources/fextor/ml/polish_base_startlist.txt"}}]}})|dir|featfilt2({"weighting":"all:sm-mi_simple","filter":"min_tf-1 min_df-1","similarity":"cosine"})|makezip' task = 'any2txt|div(20000)|doc2vec({"lang":"polish")|dir|feature2({"type":"word2vec"})'
# The assignment below is the one in effect when main() runs.
#whole documents #whole documents
task = 'any2txt|wcrft2|fextor2({"features":"base interp_signs bigrams","base_modification":"startlist","orth_modification":"startlist","lang":"pl","filters":{"base":[{"type":"lemma_stoplist","args":{"stoplist":"@resources/fextor/ml/polish_base_startlist.txt"}}]}})|dir|featfilt2({"weighting":"all:sm-mi_simple","filter":"min_tf-1 min_df-1","similarity":"cosine"})|makezip' task = 'any2txt|doc2vec({"lang":"polish"})|dir|feature2({"type":"word2vec"})'
# Base URL of the REST service; '/upload/', '/startTask/', '/getStatus/' and
# '/download' are appended to it by the functions below.
url = "http://ws.clarin-pl.eu/nlprest2/base" url = "http://ws.clarin-pl.eu/nlprest2/base"
def upload(file):
    """Send the raw bytes of *file* to the service's ``/upload/`` endpoint.

    Args:
        file: Path of the local file to upload; it is opened in binary mode.

    Returns:
        The response body decoded as UTF-8.  ``main`` uses it as the
        identifier of the uploaded file.
    """
    with open(file, "rb") as handle:
        payload = handle.read()
    response = requests.post(
        url + '/upload/',
        data=payload,
        headers={'Content-Type': 'binary/octet-stream'},
    )
    return response.content.decode('utf-8')
def process(data):
    """Start a task on the service and poll until it stops queueing/processing.

    Args:
        data: JSON-serialisable request body posted to ``/startTask/``.

    Returns:
        The ``"value"`` field of the last status response, or ``None`` when
        the last status is ``"ERROR"`` (the error value is printed first).
    """
    body = json.dumps(data)
    # The headers must be passed by keyword: the third positional parameter
    # of requests.post() is ``json``, so a positional dict is never sent as
    # headers and the Content-Type would be silently dropped.
    taskid = requests.post(
        url + '/startTask/',
        data=body,
        headers={'Content-Type': 'application/json'},
    ).content.decode('utf-8')
    print("Task ID: " + taskid)

    status_url = url + '/getStatus/' + taskid
    # Short pause before the first status query.
    time.sleep(0.2)
    status = json.loads(requests.get(status_url).content.decode('utf-8'))
    while status["status"] in ("QUEUE", "PROCESSING"):
        time.sleep(0.5)
        status = json.loads(requests.get(status_url).content.decode('utf-8'))
        # '\r' returns the cursor to the start of the line so successive
        # values overwrite each other instead of scrolling.
        print(status["value"], end='\r')

    if status["status"] == "ERROR":
        print("Error " + status["value"])
        return None
    # Terminate the line left open by the '\r' prints above.
    print()
    return status["value"]
def main():
    """Upload ``corpus.zip``, run the configured pipeline and save the result.

    The uploaded file id is wrapped in ``filezip(...)`` and joined with the
    module-level ``task`` string to form the request.  When processing
    succeeds, ``weighted.json`` is downloaded from the first result entry and
    written to ``out.json``.  The elapsed wall-clock time is printed at the
    end in every case.
    """
    in_file = 'corpus.zip'
    out_file = 'out.json'
    global_time = time.time()

    fileid = upload(in_file)
    print("Uploaded with id " + fileid)

    lpmn = 'filezip(' + fileid + ')|' + task
    result = process({'lpmn': lpmn, 'user': user})
    print("Processing finished")

    # process() returns None when the task ended with an error.
    if result is not None:
        result_id = result[0]["fileID"]
        response = requests.get(url + '/download' + result_id + "/weighted.json")
        if response.ok:
            with open(out_file, "wb") as outfile:
                outfile.write(response.content)
            print("File saved!")
        else:
            # Do not store an HTTP error page as if it were the result file.
            print("Download failed with HTTP status %s" % response.status_code)
    print("GLOBAL %s seconds ---" % (time.time() - global_time))


# Still runs when executed as a script or notebook cell, but not on import.
if __name__ == "__main__":
    main()
``` ```
%% Output %% Output
Uploaded with id /users/default/7a46fb4f-1639-4726-8ca7-55eb4ed4df93 Uploaded with id /users/default/e7064a0d-c756-4d3e-8a2c-4b0b72143f51
Task ID: da11cb9d-d4a0-4869-bd73-88f7e67ce4e9 Task ID: bca13a9d-ccf3-41cb-978f-68327b606e5a
[{'name': 'dane', 'fileID': '/requests/makezip/5e09f59e-d42b-4453-b71f-68579033b4e7'}] [{'name': 'dane', 'fileID': '/requests/feature2/932cf156-c45a-407f-8434-a2a5c7efbcbe'}]
Processing finished Processing finished
File saved! File saved!
GLOBAL 23.970514059066772 seconds --- GLOBAL 3.3759539127349854 seconds ---
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment