Skip to content
Snippets Groups Projects

Resolve "NKJP parameter tuning"

Files

@@ -311,12 +311,22 @@ def convert_examples_to_features_nosq(examples, label_list, max_seq_length, enco
assert len(valid) == max_seq_length
assert len(label_mask) == max_seq_length
features.append(
InputFeatures(input_ids=token_ids,
input_mask=input_mask,
label_id=label_ids,
valid_ids=valid,
label_mask=label_mask))
if ex_index < 2:
logging.debug("*** Example ***")
logging.debug("guid: %s" % example.guid)
logging.debug("tokens: %s" % " ".join([str(x) for x in token_ids]))
logging.debug("input_ids: %s" % " ".join([str(x) for x in token_ids]))
logging.debug("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logging.debug("label: %s (id = %s)" % (example.label, " ".join(map(str, label_ids))))
logging.debug("label_mask: %s" % " ".join([str(x) for x in label_mask]))
logging.debug("valid mask: %s" % " ".join([str(x) for x in valid]))
features.append(InputFeatures(input_ids=token_ids,
input_mask=input_mask,
label_id=label_ids,
valid_ids=valid,
label_mask=label_mask))
return features
@@ -463,17 +473,15 @@ def read_tsv(filename, with_labels=False):
return data
def save_tsv(filename, outfilename, predictions):
    """Write per-token predictions back into a CoNLL-style TSV file.

    Reads `filename` line by line, copies blank lines and '-DOCSTART'
    marker lines through verbatim, and replaces the label column of every
    token line with the next prediction.

    Args:
        filename: input TSV path (token lines are tab-separated, first
            column is the token).
        outfilename: output TSV path; overwritten if it exists.
        predictions: list of per-sentence label lists; flattened and
            consumed one label per token line, in order.

    Raises:
        IndexError: if there are fewer flattened predictions than token
            lines in the input file.
    """
    # Flatten the per-sentence predictions into one token-level stream.
    flat_predictions = [item for sublist in predictions for item in sublist]
    i = 0
    with codecs.open(outfilename, "w", "utf8") as fout:
        # Fix: open the input with a context manager too — the original
        # iterated an unclosed codecs.open() handle (resource leak).
        with codecs.open(filename, "r", "utf-8") as fin:
            for line in fin:
                cols = line.split("\t")
                if len(line.strip()) == 0 or line.startswith('-DOCSTART'):
                    # Preserve sentence separators / document markers as-is.
                    fout.write(line)
                else:
                    fout.write('%s\t%s\n' % (cols[0], flat_predictions[i]))
                    i += 1
def save_tsv(output_path, sentences, predictions):
    """Write tokens and their predicted labels as a CoNLL-style TSV file.

    Each sentence is emitted as one `token<TAB>label` line per token,
    with a blank line between sentences.

    Args:
        output_path: output TSV path; overwritten if it exists.
        sentences: list of token lists, one list per sentence.
        predictions: list of label lists, aligned with `sentences`.

    Raises:
        ValueError: if `sentences` and `predictions` differ in length.
    """
    # Validate BEFORE opening the file: the original used `assert`, which
    # is stripped under -O (zip would then silently truncate), and which
    # fired only after the output file had already been truncated/created.
    if len(sentences) != len(predictions):
        raise ValueError(
            "sentences and predictions differ in length: %d != %d"
            % (len(sentences), len(predictions)))
    with codecs.open(output_path, "w", "utf8") as fout:
        for tokens, labels in zip(sentences, predictions):
            for token, label in zip(tokens, labels):
                fout.write(f'{token}\t{label}\n')
            # Blank line separates sentences (CoNLL convention).
            fout.write("\n")
def get_dict_for_record(json_ann):
@@ -483,7 +491,8 @@ def get_dict_for_record(json_ann):
if ann.find('derivType') < 0:
if ann.strip() != '':
annotation = ann.split('\t')[1].split(' ')[0]
token = ann.split('\t')[-1]
token = ann.split('\t')[-1]
if token in token_dict.keys():
token_dict[token] = ''.join([token_dict[token],'#',annotation])
else:
Loading