Skip to content
Snippets Groups Projects
Commit 7acf5a30 authored by Michał Marcińczuk's avatar Michał Marcińczuk
Browse files

Merge branch 'dev' into kgr10_roberta

parents 87718460 981ab3aa
Branches kgr10_roberta
No related merge requests found
Pipeline #2868 passed with stage
in 4 minutes and 52 seconds
......@@ -20,13 +20,21 @@ def main(args):
sentences = [sentence[0] for sentence in sentences_labels]
labels = [sentence[1] for sentence in sentences_labels]
time_start = time.clock()
data_size = 0
for sentence in sentences:
data_size += sum([len(token) + 1 for token in sentence])
t0 = time.clock()
predictions = ner.process(sentences)
time_end = time.clock()
time_processing = time.clock() - t0
report = classification_report(labels, predictions, digits=4)
print(report)
print("Processing time: %d second(s)" % (time_end-time_start))
print(f"Total time : {time_processing:>8.4} second(s)")
print(f"Data size: : {data_size/1000000:>8.4} M characters")
print(f"Speed: : {data_size / 1000000 / (time_processing/60):>8.4} M characters/minute")
print(f"Number of token labels : {len(ner.label_list):>8} ")
def parse_args():
......@@ -39,7 +47,7 @@ def parse_args():
help='the maximum total input sequence length after WordPiece tokenization.')
parser.add_argument('--device', required=False, default="cpu", metavar='cpu|cuda',
help='device type used for processing')
parser.add_argument('--squeeze', required=False, default=False,
parser.add_argument('--squeeze', required=False, default=False, action="store_true",
help='try to squeeze multiple examples into one Input Feature')
return parser.parse_args()
......
......@@ -95,7 +95,7 @@ def main():
logger.info("Loading pretrained model...")
t0 = time.time()
if args.pretrained_path.startswith("automodel:"):
if args.pretrained_path.startswith("hf:"):
from poldeepner2.model.herbert_for_token_calssification import AutoTokenizerForTokenClassification
pretrained_dir = args.pretrained_path.split(':')[1]
model = AutoTokenizerForTokenClassification(
......
import sys, json, getopt
from tqdm import tqdm
from attr import dataclass
from dateutil import parser
......@@ -130,9 +131,8 @@ def computeScores(goldfile, userfile, cn: CategoryNormalizer, htype="split", typ
with open(goldfile) as json_data:
goldjson = json.load(json_data)
for nr in range(len(goldjson['questions'])):
for nr in tqdm(range(len(goldjson['questions']))):
idGold = '/'.join(goldjson['questions'][nr]['input']['fname'].split('/')[4:])
print(idGold)
if idGold in idsToAnnsUser:
found += 1
# find the most recent answer:
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment