#!/usr/bin/python
# Sorts a list of genes into transmembrane, secreted, mitochondrial and GPI-anchored categories.
# Currently uses the Phobius TM prediction; HMMTOP would be the tmd2 and SOSUI the tmd3 field.
import os
import sys

import dbcnf
# dbcnf.db is expected to be an open DB-API style connection (it must provide
# .cursor()); a single cursor is reused for every lookup in this script.
db = dbcnf.db
dbc = db.cursor()

# Fail with a readable usage line instead of a bare IndexError when the
# gene-list argument is missing.
if len(sys.argv) < 2:
    sys.exit('usage: %s GENE_LIST_FILE' % sys.argv[0])
fn = sys.argv[1]  # input filename
# Prefix only the file name, not the whole path: 'dir/genes' must become
# 'dir/tmd_genes', not 'tmd_dir/genes'. For a bare file name the result is
# the same 'tmd_<name>' as before.
ofn = os.path.join(os.path.dirname(fn), 'tmd_%s' % os.path.basename(fn))  # output filename
# Output columns, in order; 'gene' holds the name, the others hold the
# category value chosen for that gene (at most one of them is filled).
fields = 'gene,tmd,secreted,gpi,mito'.split(',')

def prediction_priority(row):
    """Decide which prediction has priority, as there can only be one.

    Args:
        row: a ``(tmd, loc, gpi)`` sequence, or ``None`` when no prediction
            is available for the protein. ``tmd`` is the predicted number of
            transmembrane helices (anything ``int()`` accepts), ``loc`` is
            ``'S'`` for secreted or ``'M'`` for mitochondrial, and ``gpi`` is
            ``'G'`` for a predicted GPI anchor.

    Returns:
        A dict with at most one entry, keyed by the winning category:
        ``{'gpi': '1'}``, ``{'tmd': '<count>'}`` (``'>9'`` for ten or more
        helices), ``{'secreted': '1'}`` or ``{'mito': '1'}``. An empty dict
        means no category applies (or ``row`` was ``None``).

    Raises:
        TypeError, ValueError: if ``tmd`` cannot be converted with ``int()``.
    """
    # A missing row carries no prediction at all; report "no category"
    # rather than failing while unpacking None.
    if row is None:
        return {}
    tmd, loc, gpi = row
    tmd_count = int(tmd)

    # GPI wins over everything else, but only for secreted proteins with at
    # most one predicted helix.
    if gpi == 'G' and loc == 'S' and tmd_count < 2:
        return {'gpi': '1'}
    # Ten or more helices are pooled into a single '>9' bucket.
    if tmd_count > 9:
        return {'tmd': '>9'}
    if tmd_count > 0:
        return {'tmd': str(tmd)}
    # No helices predicted: fall back to the localisation prediction.
    if loc == 'S':
        return {'secreted': '1'}
    if loc == 'M':
        return {'mito': '1'}
    return {}

# Classify every gene named in the input file, write one tab-separated row
# per gene to the output file, then print a per-category tally to stdout.

# Read the gene names; the set removes duplicates, and blank lines are
# dropped so that an empty name is never looked up in the database.
with open(fn) as infile:
    genelist = set(line.strip() for line in infile)
genelist.discard('')

count_dict = {}
out_lines = ['\t'.join(fields)]  # header row first
for gene in genelist:
    dbc.execute('SELECT tmd,loc,gpi FROM beta WHERE protein=%s', (gene,))
    row = dbc.fetchone()
    if row is None:
        # Unknown gene: keep it in the output with empty category columns
        # instead of aborting the whole run, and tell the user about it.
        sys.stderr.write('no prediction found for %s\n' % gene)
        prediction_dict = {}
    else:
        prediction_dict = prediction_priority(row)
    if prediction_dict:
        # The dict holds exactly one category at this point.
        cat = next(iter(prediction_dict))
        # Transmembrane proteins are tallied per helix count (tmd1, tmd2, ... tmd>9).
        if cat == 'tmd':
            cat += prediction_dict[cat]
        count_dict[cat] = count_dict.get(cat, 0) + 1
    prediction_dict['gene'] = gene
    out_lines.append('\t'.join(prediction_dict.get(x, '') for x in fields))

# Join once at the end instead of growing a string inside the loop.
out_text = '\n'.join(out_lines) + '\n'
with open(ofn, 'w') as outfile:
    outfile.write(out_text)

# Single-argument print(...) behaves the same as a statement and as a function.
for kv in count_dict.items():
    print('\t'.join(map(str, kv)))