#!/usr/bin/python
# Converts agilent ChIP array data to nimblegen format
# 2008-11-20 Niklaus Fankhauser
import sys,os
if not len(sys.argv)==3:
 print 'Usage: agilent2nimblegen <agilent_FeatureExtractor_CSV_output> <design_id>'
 sys.exit()
agilent_filename=sys.argv[1]
design_id=sys.argv[2]
used_features=['ProbeName','Sequence','Col','Row','gMedianSignal','rMedianSignal']
format=lambda x : '\t'.join(x) + '\n'

# Process agilent header
for i in open(agilent_filename):
 if i.find('FEATURES')==0: break
features=i.strip().split('\t')
if 'probe_mappings' in features: ndf_position='probe_mappings' # Julius new design
elif 'SystematicName' in features: ndf_position='SystematicName' # Julius old design
elif 'Description' in features: ndf_position='Description' # MacAlipine
else: print 'Missing position information.'; sys.exit()
for f in used_features:
 if f not in features:
  print 'Missing column in data file:',f
  sys.exit()

# Write nimblegen files with headers
ndf_filename=agilent_filename+'.ndf'
pos_filename=agilent_filename+'.pos'
did_filename=agilent_filename+'.id.txt'
int1_filename=agilent_filename+'.532.txt'
int2_filename=agilent_filename+'.635.txt'
header='# Converted from %s by agilent2nimblegen by Niklaus Fankhauser\n' % agilent_filename
ndf_header='DESIGN_ID\tSEQ_ID\tPROBE_ID\tPROBE_SEQUENCE\tPOSITION\tX\tY\n'
pos_header='PROBE_ID\tSEQ_ID\tCHROMOSOME\tPOSITION\n'
freq_header='IMAGE_ID\tX\tY\tPM\n'
ndf=open(ndf_filename,'w')
pos=open(pos_filename,'w')
int1=open(int1_filename,'w')
int2=open(int2_filename,'w')
ndf.write(header + ndf_header)
pos.write(header + pos_header)
int1.write(header + freq_header)
int2.write(header + freq_header)
open(did_filename,'w').write(design_id+'\n')
c1,c2,c3=0,0,0
print 'Checks passed, re-distributing data...'
for count,i in enumerate(open(agilent_filename)):
 if i.find('DATA')!=0: c1+=1; continue
 isp=i.strip().split('\t')
 data=lambda x : isp[features.index(x)]
 if not data('Sequence'): c2+=1; continue
 pos_str=data(ndf_position)
 if pos_str.find('chr')==-1: c3+=1; continue
 name=data('ProbeName')
 position=str(int(pos_str.split(':')[1].split('-')[0]))
 chromosome=pos_str.split('chr')[1].split(':')[0]
 x=str(int(data('Col'))-1)
 y=str(int(data('Row'))-1)
 ndf.write(format([design_id,name,name,data('Sequence'),position,x,y]))
 pos.write(format([name,name,chromosome,position]))
 int1.write(format([design_id+'_532',x,y,data('gMedianSignal')]))
 int2.write(format([design_id+'_635',x,y,data('rMedianSignal')]))

ndf.close(); pos.close(); int1.close(); int2.close()
print '%d probes processed.' % count
print 'Created:\n', '\n'.join((ndf_filename,pos_filename,did_filename,int1_filename,int2_filename))
print 'Lines without data: %d' % c1
print 'Lines without sequence: %d' % c2
print 'Lines without position: %d' % c3