#!/usr/bin/env python
# Parses gene interactions from KEGG pathways
# Based on the KEGG XML parser by dalloliogm
# Data source:
# ftp://ftp.genome.jp/pub/kegg/xml/kgml/metabolic/organisms/hsa/
# ftp://ftp.genome.jp/pub/kegg/xml/kgml/non-metabolic/organisms/hsa/

import xml.etree.cElementTree as ET
import logging
import sys

def KGML2Graph(xmlfile, filter_by = ()):
    """Parse a KGML pathway file and return its labels, types and edges.

    Args:
        xmlfile: File name or file object, passed straight to ``ET.parse``.
        filter_by: Not used by this function; kept so existing callers
            that pass it keep working.

    Returns:
        A 5-tuple ``(pathway_name, pathway_title, pathway_labels,
        pathway_types, pathway_edges)`` where

        * ``pathway_name`` / ``pathway_title`` are the ``name`` and
          ``title`` attributes of the root element,
        * ``pathway_labels`` maps each entry ``id`` to the ``name``
          attribute of its ``<graphics>`` child (``None`` when the entry
          has no such child or attribute),
        * ``pathway_types`` maps each entry ``id`` to its ``type``
          attribute,
        * ``pathway_edges`` is a list of ``[entry1, entry2]`` id pairs,
          one per ``<relation>`` element, in document order.

    Raises:
        NotImplementedError: If the root element's ``org`` attribute is
            ``'ec'``.
    """
    root = ET.parse(xmlfile).getroot()

    # 'ko' maps and organism-specific maps are parsed identically below;
    # only 'ec' maps are rejected.
    if root.get('org') == 'ec':
        raise NotImplementedError('Didn\'t implement EC pathways yet')

    pathway_title = root.get('title')
    pathway_name = root.get('name')

    # Entries: map each node id to its display label and its type.
    pathway_labels = {}
    pathway_types = {}
    for entry in root.iter('entry'):
        node_id = entry.get('id')
        node_type = entry.get('type')   # e.g. 'gene', 'compound', 'map'
        # Lazy %-formatting: no TypeError if an attribute is missing.
        logging.debug('%s %s', node_type, node_id)
        # An entry may have no <graphics> child; label it None instead of
        # failing on None.get().
        graphics = entry.find('graphics')
        node_title = graphics.get('name') if graphics is not None else None
        logging.debug(node_title)
        pathway_labels[node_id] = node_title
        pathway_types[node_id] = node_type

    # Relations: one [entry1, entry2] edge per <relation> element.
    # pathway_relations is filled in but is not part of the return value.
    pathway_edges = []
    pathway_relations = {}
    for rel in root.iter('relation'):
        e1 = rel.get('entry1')
        e2 = rel.get('entry2')
        pathway_edges.append([e1, e2])
        pathway_relations[e1 + '_' + e2] = rel

    # Reactions: collected per reaction name; not part of the return value.
    pathway_reactions = {}
    for reaction in root.iter('reaction'):
        substrates = [sub.get('name') for sub in reaction.iter('substrate')]
        # Each product contributes its own name (the earlier code appended
        # the last substrate's name here instead).
        products = [prod.get('name') for prod in reaction.iter('product')]
        pathway_reactions[reaction.get('name')] = {
            'reaction': reaction,
            'substrates': substrates,
            'products': products,
        }

    return pathway_name, pathway_title, pathway_labels, pathway_types, pathway_edges

# Script body: parse the KGML file named by the first command-line argument
# and print one tab-separated line per relation whose two endpoints are both
# 'gene' entries: <label1> <label2> <pathway title> <pathway name>.
name, title, labels, types, edges = KGML2Graph(sys.argv[1])
for e1, e2 in edges:
    # Skip relations that involve a non-gene entry (compound, map, ...).
    if types[e1] != 'gene' or types[e2] != 'gene':
        continue
    # Single parenthesised argument: valid both as a Python 2 print
    # statement and as a Python 3 print() call. The '...' that appears in
    # graphics labels is stripped from the output.
    print('%s\t%s\t%s\t%s' % (
        labels[e1].replace('...', ''),
        labels[e2].replace('...', ''),
        title,
        name,
    ))