# Source code for gocats.ontologyparser

# !/usr/bin/python3
"""
A parser which reads ontologies in the OBO format and calls appropriate graph objects to store information in a graph
representation. Separate parsing classes within this module operate on distinct ontologies in the OBO Foundry to handle
any subtle differences among ontologies.
"""
import re
from .dag import AbstractEdge, DirectionalRelationship
from .godag import GoGraphNode


class OboParser(object):
    """A scaffolding for parsing OBO formatted ontologies.

    Contains regular expressions for the basic stanzas and information pertinent for creating a graph object of an
    ontology.
    """

    def __init__(self):
        """`OboParser` initializer. Contains Regular Expressions for identifying crucial information from OBO formatted
        ontologies.

        Every pattern is compiled once here so that subclasses can reuse it for each line they read.
        """
        # Raw strings keep regex escapes such as \[ \d \s away from Python's own string-escape processing, which
        # otherwise reports them as invalid escape sequences. The resulting pattern text is unchanged.

        # Stanza headers.
        self.term_stanza = re.compile(r'\[Term\]')
        self.typedef_stanza = re.compile(r'^\[Typedef\]')

        # A GO accession: the literal "GO:" followed by exactly seven digits.
        self.go_term = re.compile(r'GO\:\d{7}')

        # Tags found inside a stanza.
        self.stanza_id = re.compile(r'^id:')
        self.stanza_name = re.compile(r'^name:')
        self.namespace = re.compile(r'^namespace:')
        self.term_definition = re.compile(r'^def:')
        self.obsolete = re.compile(r'^is_obsolete:\strue')
        self.is_a = re.compile(r'^is_a:')
        self.relationship_match = re.compile(r'^relationship:')
        self.inverse_tag = re.compile(r'^inverse_of:')

        # A line that starts with whitespace (e.g. a blank line) closes the current stanza.
        self.end_stanza = re.compile(r'^\s+')

        # May use later.
        # self.comment = re.compile(r'\!.+')
        # self.subset = re.compile(r'^subset:')
class GoParser(OboParser):
    """An ontology parser specific to Gene Ontology"""

    def __init__(self, database_file, go_graph, relationship_directionality='gocats'):
        """`GoParser` initializer. Parses a Gene Ontology database file and adds properties found therein to a
        :class:`godag.GoGraph` object. **Importantly:** includes descriptions of semantic directionality of all GO
        relationships.

        :param file_handle database_file: Specify the location of a Gene Ontology .obo file. It is only ever iterated
            line by line.
        :param go_graph: :class:`gocats.godag.GoGraph` object.
        :param str relationship_directionality: ``'gocats'`` (default) marks ``has_part`` as pointing from node 2 to
            node 1 (direction 0); any other value leaves ``has_part`` with the conventional direction (1).
        :return: None
        :rtype: :py:obj:`None`
        """
        super().__init__()
        self.database_file = database_file
        self.go_graph = go_graph

        # 5 types of relationships: scoping, scaling (grouped with scoping for now), spatiotemporal, active,
        # equivalence (not present here), and other.
        # 1 means that the relationship directionality is conventional, 0 means that the semantic directionality
        # points from node 2 to node 1.
        # "has_part" is the only entry whose direction depends on `relationship_directionality`.
        has_part_direction = 0 if relationship_directionality == 'gocats' else 1
        self.relationship_mapping = {
            "ends_during": ("spatiotemporal", 1),
            "happens_during": ("spatiotemporal", 1),
            "has_part": ("scoping", has_part_direction),
            "negatively_regulates": ("active", 1),
            "never_in_taxon": ("other", 1),
            "occurs_in": ("spatiotemporal", 1),
            "part_of": ("scoping", 1),
            "positively_regulates": ("active", 1),
            "regulates": ("active", 1),
            "starts_during": ("spatiotemporal", 1),
            "is_a": ("scoping", 1),
            "term_tracker_item": ("other", 1),
        }

    def parse(self):
        """Parses the ontology database file and accesses the ontology graph object to add information found in the
        database. Once all information is added, this function calls the graph's instantiate_valid_edges function to
        connect all nodes in the graph by their edges.

        A stanza is committed to the graph when a line starting with whitespace (normally a blank line) is read, and
        also when the input ends while a stanza is still open, so a file without a trailing blank line does not lose
        its last stanza.

        :return: None
        :rtype: :py:obj:`None`
        :raises KeyError: if a ``[Typedef]`` stanza declares an id that is not in ``self.relationship_mapping``.
        """
        is_term = False
        is_typedef = False

        for line in self.database_file:
            if not is_term and not is_typedef:
                # Outside any stanza: only a stanza header is meaningful, every other line is skipped.
                if self.term_stanza.match(line):
                    is_term = True
                    node = GoGraphNode()
                    node_edge_list = []
                elif self.typedef_stanza.match(line):
                    is_typedef = True
                    relationship_obj = DirectionalRelationship()
                continue

            if is_term:
                if self.stanza_id.match(line):
                    node.id = self.go_term.findall(line)[0]
                    curr_stanza_id = node.id
                elif self.stanza_name.match(line):
                    # Skips 'name: '. rstrip drops only the newline, so a final line without one keeps its last
                    # character.
                    node.name = line[6:].rstrip("\n").lower()
                elif self.namespace.match(line):
                    node.namespace = line[11:].rstrip("\n").lower()  # skips 'namespace: '
                elif self.term_definition.match(line):
                    # This pattern matches the definition listed within quotes on the line.
                    node.definition = re.findall(r'"(.*?)"', line)[0].lower()
                elif self.is_a.match(line):
                    # node1, node2, relationship
                    node_edge = AbstractEdge(curr_stanza_id, self.go_term.findall(line)[0], 'is_a')
                    node_edge_list.append(node_edge)
                    self._register_is_a_relationship()
                elif self.relationship_match.match(line):
                    # line example: relationship: part_of GO:0040025 ! vulval development
                    # Word 0 is the "relationship" tag itself; word 1 is the relationship id.
                    relationship_id = re.findall(r"[\w]+", line)[1]
                    node_edge = AbstractEdge(curr_stanza_id, self.go_term.findall(line)[0], relationship_id)
                    node_edge_list.append(node_edge)
                elif self.obsolete.match(line):
                    node.obsolete = True
                elif self.end_stanza.match(line):
                    self._commit_term(node, node_edge_list)
                    is_term = False
            else:  # is_typedef
                if self.stanza_id.match(line):
                    relationship_obj.id = re.findall(r"[\w+\:]+", line)[1]
                elif self.stanza_name.match(line):
                    # NOTE(review): only the first word after 'name:' is kept, so a multi-word name is shortened to
                    # its first word. Left unchanged; confirm whether the full name is wanted.
                    relationship_obj.name = re.findall(r"[\w+\:]+", line)[1]
                elif self.inverse_tag.match(line):
                    relationship_obj.inverse_relationship_id = re.findall(r"[\w+\:]+", line)[1]
                elif self.end_stanza.match(line):
                    self._commit_typedef(relationship_obj)
                    is_typedef = False

        # The input ended without a closing blank line: commit the stanza that is still open.
        if is_term:
            self._commit_term(node, node_edge_list)
        elif is_typedef:
            self._commit_typedef(relationship_obj)

        self.go_graph.instantiate_valid_edges()

    def _register_is_a_relationship(self):
        """Add the implicit "is_a" relationship to the graph the first time an is_a tag is seen."""
        if "is_a" in self.go_graph.relationship_index:
            return
        is_a_relationship = DirectionalRelationship()
        is_a_relationship.id = "is_a"
        is_a_relationship.name = "is a"
        is_a_relationship.category = self.relationship_mapping["is_a"][0]
        is_a_relationship.direction = self.relationship_mapping["is_a"][1]
        self.go_graph.used_relationship_set.add(is_a_relationship.id)
        self.go_graph.add_relationship(is_a_relationship)

    def _commit_term(self, node, node_edge_list):
        """Add a finished [Term] node and its allowed edges to the graph, recording edge-less nodes as roots."""
        # NOTE(review): the root-node check is assumed to apply only to nodes the graph accepts as valid -- confirm
        # against the upstream layout.
        if not self.go_graph.valid_node(node):
            return
        self.go_graph.add_node(node)
        allowed_relationships = self.go_graph.allowed_relationships
        for edge in node_edge_list:
            # An empty/falsy allowed_relationships means every relationship is accepted.
            if not allowed_relationships or edge.relationship_id in allowed_relationships:
                self.go_graph.add_edge(edge)
                self.go_graph.used_relationship_set.add(edge.relationship_id)
        # Have to look at the local edge list because nodes have not been linked with edges yet. Entire graph must
        # be populated first. This is the only way to do this on-the-fly.
        # The explicit comparison with False is kept on purpose: GoGraphNode's default for `obsolete` is not visible
        # in this module.
        if not node_edge_list and node.obsolete == False:  # noqa: E712
            self.go_graph.root_nodes.append(node)  # make root nodes a set of all namespaces used in the ontology.

    def _commit_typedef(self, relationship_obj):
        """Assign category and direction to a finished [Typedef] relationship and add it to the graph."""
        # An id that is missing from relationship_mapping raises KeyError here.
        properties = self.relationship_mapping[relationship_obj.id]
        relationship_obj.category = properties[0]
        relationship_obj.direction = properties[1]
        self.go_graph.add_relationship(relationship_obj)