#!/usr/bin/env python
"""Read clause / word-order annotations from a tab-separated file.

The file is parsed by :class:`ExcelParser`, converted into a GrAF graph
through ``poioapi.io.graf.GrAFConverter`` and wrapped in a
``poioapi.annotationgraph.AnnotationGraph`` (see :func:`from_excel`).
:func:`word_orders` then walks such a graph and yields one
:class:`WordOrder` record per clause.
"""
import csv
import codecs
import collections
import sys

import poioapi.io.graf
import poioapi.annotationgraph
import poioapi.data

# True when running under Python 2, where ``csv`` cannot read unicode text.
_PY2 = sys.version_info[0] == 2


class WrongAnnotationCount(Exception):
    """Raised when a clause does not carry exactly one expected annotation."""
    pass


# One record per clause as yielded by ``word_orders``.
WordOrder = collections.namedtuple("WordOrder", "clause_id word_order clause_type agreement")


def from_excel(filepath, skip_lines=None, tier_numbers=None):
    """Build an annotation graph from a tab-separated annotation file.

    :param filepath: path of the UTF-8 encoded, tab-delimited file.
    :param skip_lines: container of first-cell values; every row whose
        first cell is in it is ignored. ``None`` means "skip nothing".
    :param tier_numbers: mapping that is handed on to ``ExcelParser``;
        see there for the required keys.
    :returns: a ``poioapi.annotationgraph.AnnotationGraph`` whose
        ``tier_hierarchies``, ``structure_type_handler`` and ``graf``
        attributes are filled from the converted file.
    """
    ag = poioapi.annotationgraph.AnnotationGraph()
    parser = ExcelParser(filepath, skip_lines, tier_numbers)
    converter = poioapi.io.graf.GrAFConverter(parser)
    converter.parse()

    ag.tier_hierarchies = converter.tier_hierarchies
    ag.structure_type_handler = poioapi.data.DataStructureType(
        ag.tier_hierarchies[0])
    ag.graf = converter.graf
    return ag


def unicode_csv_reader(unicode_csv_data, **kwargs):
    """Iterate over the rows of unicode CSV data.

    :param unicode_csv_data: iterable of unicode text lines.
    :param kwargs: passed through unchanged to ``csv.reader``.
    :returns: generator yielding each row as a list of unicode cells.

    On Python 3 ``csv.reader`` consumes text directly. On Python 2 the
    ``csv`` module cannot handle unicode, so every line is encoded to
    UTF-8 for parsing and every cell is decoded again afterwards.
    """
    if not _PY2:
        for row in csv.reader(unicode_csv_data, **kwargs):
            yield row
        return

    # csv.py doesn't do Unicode on Python 2; encode temporarily as UTF-8:
    csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
    for row in csv_reader:
        # decode UTF-8 back to Unicode, cell by cell:
        yield [unicode(cell, 'utf-8') for cell in row]  # noqa: F821 (Python 2 only)


def utf_8_encoder(unicode_csv_data):
    """Yield every line of ``unicode_csv_data`` encoded as UTF-8."""
    for line in unicode_csv_data:
        yield line.encode('utf-8')


def word_orders(ag, search_terms=None, annotation_map=None, with_agreement=False):
    """Yield a ``WordOrder`` record for every clause in an annotation graph.

    :param ag: annotation graph offering ``nodes_for_tier`` and
        ``annotation_value_for_node`` (e.g. the result of ``from_excel``).
    :param search_terms: if not ``None``, only grammatical relations whose
        (unmapped) value is contained in it are collected.
    :param annotation_map: optional mapping used to rename collected
        grammatical relation values; unmapped values are kept as they are.
    :param with_agreement: if true, also collect the agreement annotation
        of every collected grammatical relation.
    :returns: generator of ``WordOrder(clause_id, word_order, clause_type,
        agreement)`` tuples; ``agreement`` stays empty unless
        ``with_agreement`` is set.
    :raises WrongAnnotationCount: if a clause does not have exactly one
        ``clause_type`` node.
    """
    if annotation_map is None:
        annotation_map = {}

    for parent_node in ag.nodes_for_tier("clause_id"):
        word_order = []
        agreement = []

        type_nodes = ag.nodes_for_tier("clause_type", parent_node)
        if len(type_nodes) != 1:
            raise WrongAnnotationCount(
                "no clause type in clause unit {0}".format(parent_node.id))
        clause_type = ag.annotation_value_for_node(type_nodes[0])

        for gramm_node in ag.nodes_for_tier("grammatical_relation", parent_node):
            a_value = ag.annotation_value_for_node(gramm_node)
            if search_terms is not None and a_value not in search_terms:
                continue

            if a_value in annotation_map:
                a_value = annotation_map[a_value]
            word_order.append(a_value)

            if not with_agreement:
                continue

            agr_nodes = ag.nodes_for_tier("agreement", gramm_node)
            if len(agr_nodes) != 1:
                # NOTE(review): nothing is appended to ``agreement`` here, so
                # it can end up shorter than ``word_order`` for this clause.
                print("no agreement annotation in clause unit {0} for grammatical relation '{1}'".format(
                    parent_node.id, a_value))
            else:
                agreement.append(ag.annotation_value_for_node(agr_nodes[0]))

        yield WordOrder(parent_node.id, word_order, clause_type, agreement)


class ExcelParser(poioapi.io.graf.BaseParser):
    """Parser feeding a tab-separated annotation file to ``GrAFConverter``.

    The file is read in groups of consecutive rows. Within a group the
    rows are counted from 0 (skipped rows are not counted) and
    ``tier_numbers`` says which row holds which tier. The columns of the
    rows of one group belong together: a non-empty cell in the clause ID
    row starts a new clause, and the grammatical relations found from that
    column up to the next clause ID form the clause's word order.

    Attributes filled while parsing:

    * ``clauses`` -- clause IDs in order of appearance.
    * ``clause_types`` -- clause ID -> clause type.
    * ``word_orders`` -- clause ID -> list of ``(relation_id, relation)``.
    * ``agreements`` -- relation ID -> agreement value.
    """

    # Keys of ``tier_numbers`` that select a row of a group, in the order
    # in which they are checked.
    _TIER_ROWS = ("clause_id", "clause_type", "grammatical_relation",
                  "pos_agreement")

    def __init__(self, filepath, skip_lines=None, tier_numbers=None):
        """Parse ``filepath`` completely.

        :param filepath: path of the UTF-8 encoded, tab-delimited file.
        :param skip_lines: container of first-cell values; rows whose first
            cell is in it are ignored. ``None`` means "skip nothing".
        :param tier_numbers: mapping with the keys ``"clause_id"``,
            ``"clause_type"``, ``"grammatical_relation"`` and
            ``"pos_agreement"`` (0-based row index of that tier within a
            group) and ``"last_line"`` (index of the last row of a group).
        :raises TypeError: if ``tier_numbers`` is ``None``.
        """
        if tier_numbers is None:
            raise TypeError(
                "tier_numbers is required: a mapping with the keys "
                "'clause_id', 'clause_type', 'grammatical_relation', "
                "'pos_agreement' and 'last_line'")
        if skip_lines is None:
            skip_lines = ()

        self.word_orders = dict()
        self.agreements = dict()
        self.clauses = list()
        self.clause_types = dict()
        self.last_id = -1

        with codecs.open(filepath, "r", "utf-8") as csvfile:
            rows = {}
            i = 0
            for row in unicode_csv_reader(csvfile, delimiter="\t"):
                # A completely blank line is parsed as an empty row; treat
                # it like a row whose first cell is empty.
                first_cell = row[0] if row else ""
                if first_cell in skip_lines:
                    continue

                for tier_name in self._TIER_ROWS:
                    if i == tier_numbers[tier_name]:
                        rows[tier_name] = row
                        break

                i += 1
                if i > tier_numbers["last_line"]:
                    # The group is complete: parse it and start the next one.
                    self._parse_group(rows["clause_id"],
                                      rows["clause_type"],
                                      rows["grammatical_relation"],
                                      rows["pos_agreement"])
                    i = 0

    def _parse_group(self, clause_ids, clause_types, grammatical_relations,
                     pos_agreements):
        """Collect clauses and grammatical relations of one group of rows."""
        word_order = []
        c_id = None
        for col, clause_id in enumerate(clause_ids):
            if clause_id != "":
                # A new clause starts here; store what was collected for the
                # previous one. Relations found before the first clause ID
                # of a group are stored under the key ``None``.
                self.word_orders[c_id] = word_order
                word_order = []

                c_id = clause_id
                if c_id in self.clauses:
                    # NOTE(review): the relation in this column is dropped,
                    # but later relations are still collected under the
                    # duplicate ID and replace the word order stored for
                    # its first occurrence -- confirm this is intended.
                    print("Error: duplicate clause ID: {0}".format(c_id))
                    continue
                self.clauses.append(c_id)
                self.clause_types[c_id] = clause_types[col].strip()

            grammatical_relation = grammatical_relations[col].strip()
            if grammatical_relation:
                pos_agreement = pos_agreements[col].strip()
                if "zero" in pos_agreement:
                    grammatical_relation = "zero-{0}".format(grammatical_relation)
                if grammatical_relation == "say":
                    grammatical_relation = "SAY"
                gr_id = self._next_id()
                self.agreements[gr_id] = pos_agreement
                word_order.append((gr_id, grammatical_relation))

        self.word_orders[c_id] = word_order

    def _next_id(self):
        """Return the next unused integer annotation ID."""
        self.last_id += 1
        return self.last_id

    def get_root_tiers(self):
        """Return the single root tier, ``clause_id``."""
        return [poioapi.io.graf.Tier("clause_id")]

    def get_child_tiers_for_tier(self, tier):
        """Return the child tiers of ``tier`` or ``None`` if it has none."""
        if tier.name == "clause_id":
            return [poioapi.io.graf.Tier("grammatical_relation"),
                    poioapi.io.graf.Tier("clause_type")]
        elif tier.name == "grammatical_relation":
            return [poioapi.io.graf.Tier("agreement")]
        return None

    def get_annotations_for_tier(self, tier, annotation_parent=None):
        """Return the annotations of ``tier`` below ``annotation_parent``.

        Clause annotations use the clause ID both as ID and as value;
        grammatical relations reuse the IDs assigned while parsing; clause
        type and agreement annotations get a fresh ID on every call. An
        empty list is returned for an unknown tier and for a grammatical
        relation without agreement value.
        """
        if tier.name == "clause_id":
            return [poioapi.io.graf.Annotation(c_id, c_id)
                    for c_id in self.clauses]
        elif tier.name == "clause_type":
            return [poioapi.io.graf.Annotation(
                self._next_id(), self.clause_types[annotation_parent.id])]
        elif tier.name == "grammatical_relation":
            return [poioapi.io.graf.Annotation(gr_id, value)
                    for gr_id, value in self.word_orders[annotation_parent.id]]
        elif tier.name == "agreement":
            if annotation_parent and self.agreements[annotation_parent.id]:
                return [poioapi.io.graf.Annotation(
                    self._next_id(), self.agreements[annotation_parent.id])]
        return []

    def tier_has_regions(self, tier):
        """Annotations of this parser never carry regions."""
        return False

    def region_for_annotation(self, annotation):
        """No regions are provided; returns ``None``."""
        pass

    def get_primary_data(self):
        """No primary data is provided; returns ``None``."""
        pass