#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scorer utilities for word-level translation-error annotations.

Each annotation line must have 6 whitespace-separated fields:
    <sentence-id "X.Y"> <word-index> <word> <fine-label> <coarse-label> <binary-label>
"""
import sys

# Fine-grained error categories, split by top-level class (accuracy vs fluency).
issues_accuracy = ['Terminology', 'Mistranslation', 'Omission', 'Addition',
                   'Untranslated', 'Accuracy']
issues_fluency = ['Agreement', 'Capitalization', 'Fluency', 'Function_words',
                  'Grammar', 'Morphology_(word_form)', 'Style/register',
                  'Typography', 'Unintelligible', 'Word_order',
                  'Tense/aspect/mood', 'Punctuation', 'Spelling',
                  'Part_of_speech']


def read_annotation(f):
    """Parse an annotation file object into a nested dict.

    Args:
        f: iterable of annotation lines (an open file or file-like object).

    Returns:
        dict mapping sentence id tuple (X, Y) -> word index (int)
        -> [fine, coarse, binary] with all labels lower-cased.

    Raises:
        AssertionError: on any malformed line (assertions kept to preserve
        the original script's error-reporting behavior).
    """
    # Precompute the lower-cased valid fine-label set once instead of
    # rebuilding map() objects on every line.
    valid_fine = set(map(str.lower, issues_accuracy + issues_fluency))
    anno = {}
    for linenr, line in enumerate(f):
        fields = line.strip().split()
        assert len(fields) == 6, \
            "line %d: expected 6 elements per line but found %d\n" % (linenr, len(fields))
        sid, wid, w, a1, a2, a3 = fields
        parts = sid.split('.')
        assert len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit(), \
            "line %d: first entry (sentence id) must be in format X.Y\n" % linenr
        assert wid.isdigit(), \
            "line %d: second entry (word index) must be integer\n" % linenr
        sid = (int(parts[0]), int(parts[1]))
        wid = int(wid)
        assert a1.lower() == "ok" or a1.lower() in valid_fine, \
            "line %d: unexpected error category %s\n" % (linenr, a1)
        assert a2.lower() in ['ok', 'fluency', 'accuracy'], \
            "line %d: unexpected error category %s\n" % (linenr, a2)
        assert a3.lower() in ['ok', 'bad'], \
            "line %d: unexpected error category %s\n" % (linenr, a3)
        if sid not in anno:
            anno[sid] = {}
        # sid is an (X, Y) tuple, so format its components separately;
        # the original "%d" on the tuple raised TypeError when this fired.
        assert wid not in anno[sid], \
            "line %d: duplicate entry for s%d.%d:w%d" % (linenr, sid[0], sid[1], wid)
        anno[sid][wid] = [a1.lower(), a2.lower(), a3.lower()]
    return anno


def compare_anno(anno1, anno2):
    """Assert that every sentence/word key of anno1 is also present in anno2.

    Call it in both directions to verify two annotation dicts cover exactly
    the same tokens.
    """
    for sid in anno1:
        # sid is a 2-tuple, so it satisfies both %d specifiers directly.
        assert sid in anno2, "s%d.%d only found in one file\n" % sid
        for wid in anno1[sid]:
            assert wid in anno2[sid], \
                "s%d.%d:w%d only found in one file\n" % (sid[0], sid[1], wid)


def get_precision(tp, fp):
    """Return precision tp/(tp+fp); 0.0 when there are no true positives."""
    if tp > 0:
        return float(tp) / (tp + fp)
    return 0.


def get_recall(tp, fn):
    """Return recall tp/(tp+fn); 0.0 when there are no true positives."""
    if tp > 0:
        return float(tp) / (tp + fn)
    return 0.
def get_f1(tp, fn, fp):
    """Return the F1 score (harmonic mean of precision and recall).

    Returns 0.0 whenever precision or recall is 0.
    """
    precision = get_precision(tp, fp)
    recall = get_recall(tp, fn)
    if precision * recall > 0:
        return 2. * precision * recall / (precision + recall)
    return 0.


def matrix(n):
    """Return an n x n matrix of zeros; each row is an independent list."""
    return [[0] * n for _ in range(n)]


def eval_sub(anno1, anno2, idx, options, name):
    """Score column `idx` of anno2 (prediction) against anno1 (reference).

    Prints a confusion matrix and per-class F1 scores, then returns the
    weighted average F1 over all non-'ok' classes, each class weighted by
    its frequency in the reference.

    Args:
        anno1: reference annotations as produced by read_annotation().
        anno2: predicted annotations with the same keys as anno1.
        idx: which label column to score (0 fine, 1 coarse, 2 binary).
        options: list of valid label strings for this column.
        name: human-readable name used in the printed report.
    """
    # Lower-case once into a real list: a map() object (Python 3) has no
    # .index() and can only be iterated once, which broke this function.
    options = [o.lower() for o in options]
    short_options = [o[:7] for o in options]
    cf = matrix(len(options))
    for sid in anno1:
        for wid in anno1[sid]:
            ref = options.index(anno1[sid][wid][idx])
            pred = options.index(anno2[sid][wid][idx])
            cf[pred][ref] += 1
    print("----- Results for %s: -----" % name)
    print("-------------------------------------")
    print("\tREFERENCE")
    print("PREDICT\t" + "\t".join(short_options))
    for rownr, row in enumerate(cf):
        print("%s\t%s" % (short_options[rownr], "\t".join(map(str, row))))
    print("-------------------------------------")
    weighted_average_f1 = 0.
    norm = 0
    ok_idx = options.index('ok')
    for i in range(len(options)):
        tp = cf[i][i]
        fp = sum(cf[i]) - tp                 # predicted class i, reference differs
        fn = sum(row[i] for row in cf) - tp  # reference class i, prediction differs
        f1 = get_f1(tp, fn, fp)
        if i != ok_idx:
            # weight each error class by its reference frequency (tp + fn)
            weighted_average_f1 += f1 * (tp + fn)
            norm += tp + fn
        print("F1 %24s: %f" % (options[i], f1))
    # Guard against ZeroDivisionError when no error class occurs in the reference.
    avg = weighted_average_f1 / norm if norm else 0.
    print("\n %24s: %f" % ("WEIGHTED AVG", avg))
    print("-------------------------------------")
    return avg


def eval_a1(anno1, anno2):
    """Evaluate the fine-grained (multiclass) label column."""
    options = ["ok"] + issues_fluency + issues_accuracy
    return eval_sub(anno1, anno2, 0, options, "multiclass")


def eval_a2(anno1, anno2):
    """Evaluate the coarse ok/fluency/accuracy label column."""
    return eval_sub(anno1, anno2, 1, ['ok', 'fluency', 'accuracy'], "3-class")


def eval_a3(anno1, anno2):
    """Evaluate the binary ok/bad label column."""
    return eval_sub(anno1, anno2, 2, ['ok', 'bad'], "binary")


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('ref', help="reference annotations")
    parser.add_argument('sub', help="submission annotations")
    args = parser.parse_args(sys.argv[1:])
    # Use context managers so the file handles are closed deterministically.
    with open(args.ref) as f:
        ref = read_annotation(f)
    with open(args.sub) as f:
        submission = read_annotation(f)
    # Check both directions so neither file contains extra entries.
    compare_anno(ref, submission)
    compare_anno(submission, ref)
    eval_a1(ref, submission)
    eval_a2(ref, submission)
    eval_a3(ref, submission)