#!/usr/bin/perl
# 2010-03-30, Nick Fankhauser
# Downloads slice sequences for a list of gene symbols to fasta format.

use lib "/opt/ensembl/modules";
use Bio::EnsEMBL::Registry;
use strict;
my $registry = 'Bio::EnsEMBL::Registry';
$registry->load_registry_from_db(-host => 'ensembldb.ensembl.org',-user => 'anonymous');

sub gene_sequence {
	my $gene_symbol=shift;
	my $ensembl_id=shift;
	my $ofn2=shift;
	my $gene_adaptor = $registry->get_adaptor( 'Human', 'Core', 'Gene' );
	my $gene = $gene_adaptor->fetch_by_display_label($gene_symbol);
	my $gid=$gene->stable_id();
	my $slice_adaptor = $registry->get_adaptor( 'Human', 'Core', 'Slice' );
	my $slice = $slice_adaptor->fetch_by_gene_stable_id( $gid );
	if ( not defined $slice) {return "";}
	my $seq_region = $slice->seq_region_name();
	my $start      = $slice->start();
	my $end        = $slice->end();
	my $strand     = $slice->strand();
	open (OUT2, ">>$ofn2");	# open position output file
	print OUT2 "$gene_symbol\t$gid\t$seq_region\t$start\t$end\t$strand\n";
	close OUT2;
	my $sseq=$slice->seq();
	my $fasta=">$gene_symbol\n$sseq\n";
	return $fasta;
}

my $fn=shift;	# get filename from first command line argument
my $ofn1=$fn.".seq.txt";
my $ofn2=$fn.".pos.txt";
open (FILE, $fn);	# open the file
open (OUT1, ">$ofn1");	# create sequence output file
close OUT1;
open (OUT2, ">$ofn2");	# create position output file
close OUT2;
while (<FILE>) {	# loop though the file
	chomp();
	next if length($_)<2;
	my @line_split=split("\t");	# split line into columns (Gene_symbol\tENSEMBL_id)
	open (OUT1, ">>$ofn1");	# open output file
	print OUT1 gene_sequence($line_split[0],$line_split[1],$ofn2);
	close OUT1;
}