#!/usr/bin/python
# Converts agilent ChIP array data to nimblegen format
# 2008-11-20 Niklaus Fankhauser
import sys,os
if not len(sys.argv)==3:
print 'Usage: agilent2nimblegen <agilent_FeatureExtractor_CSV_output> <design_id>'
sys.exit()
agilent_filename=sys.argv[1]
design_id=sys.argv[2]
used_features=['ProbeName','Sequence','Col','Row','gMedianSignal','rMedianSignal']
format=lambda x : '\t'.join(x) + '\n'
# Process agilent header
for i in open(agilent_filename):
if i.find('FEATURES')==0: break
features=i.strip().split('\t')
if 'probe_mappings' in features: ndf_position='probe_mappings' # Julius new design
elif 'SystematicName' in features: ndf_position='SystematicName' # Julius old design
elif 'Description' in features: ndf_position='Description' # MacAlipine
else: print 'Missing position information.'; sys.exit()
for f in used_features:
if f not in features:
print 'Missing column in data file:',f
sys.exit()
# Write nimblegen files with headers
ndf_filename=agilent_filename+'.ndf'
pos_filename=agilent_filename+'.pos'
did_filename=agilent_filename+'.id.txt'
int1_filename=agilent_filename+'.532.txt'
int2_filename=agilent_filename+'.635.txt'
header='# Converted from %s by agilent2nimblegen by Niklaus Fankhauser\n' % agilent_filename
ndf_header='DESIGN_ID\tSEQ_ID\tPROBE_ID\tPROBE_SEQUENCE\tPOSITION\tX\tY\n'
pos_header='PROBE_ID\tSEQ_ID\tCHROMOSOME\tPOSITION\n'
freq_header='IMAGE_ID\tX\tY\tPM\n'
ndf=open(ndf_filename,'w')
pos=open(pos_filename,'w')
int1=open(int1_filename,'w')
int2=open(int2_filename,'w')
ndf.write(header + ndf_header)
pos.write(header + pos_header)
int1.write(header + freq_header)
int2.write(header + freq_header)
open(did_filename,'w').write(design_id+'\n')
c1,c2,c3=0,0,0
print 'Checks passed, re-distributing data...'
for count,i in enumerate(open(agilent_filename)):
if i.find('DATA')!=0: c1+=1; continue
isp=i.strip().split('\t')
data=lambda x : isp[features.index(x)]
if not data('Sequence'): c2+=1; continue
pos_str=data(ndf_position)
if pos_str.find('chr')==-1: c3+=1; continue
name=data('ProbeName')
position=str(int(pos_str.split(':')[1].split('-')[0]))
chromosome=pos_str.split('chr')[1].split(':')[0]
x=str(int(data('Col'))-1)
y=str(int(data('Row'))-1)
ndf.write(format([design_id,name,name,data('Sequence'),position,x,y]))
pos.write(format([name,name,chromosome,position]))
int1.write(format([design_id+'_532',x,y,data('gMedianSignal')]))
int2.write(format([design_id+'_635',x,y,data('rMedianSignal')]))
ndf.close(); pos.close(); int1.close(); int2.close()
print '%d probes processed.' % count
print 'Created:\n', '\n'.join((ndf_filename,pos_filename,did_filename,int1_filename,int2_filename))
print 'Lines without data: %d' % c1
print 'Lines without sequence: %d' % c2
print 'Lines without position: %d' % c3