#!/usr/bin/perl
# List of chromosome regions (chrX:P1-P2) -> Name/IDs of genes at or near position
# 2009-02-18 by Nick Fankhauser

use lib "/opt/ensembl/modules";
use Bio::EnsEMBL::Registry;
use strict;
my $debug=0; # show information about multiple genes, if they are found at one position
my $range=10000; # how far to search up- and downstream
my $registry = 'Bio::EnsEMBL::Registry';
$registry->load_registry_from_db( -host => 'ensembldb.ensembl.org', -user => 'anonymous' );

# Slice adaptor for mouse genome:
my $slice_adaptor = $registry->get_adaptor( 'Mouse', 'Core', 'Slice' );
my $tr_adaptor    = $registry->get_adaptor( 'Mouse', 'Core', 'Transcript' );
my $gene_adaptor = $registry->get_adaptor( 'Mouse', 'Core', 'Gene' );

sub chromo_pos { # returns transcript adaptor
	my $adaptor=shift;	# adaptor
	my $chromo=shift;	# chromosome
	my $pos1=int(shift);		# start position in nucleotides on chromosome
	my $pos2=int(shift);		# end position
	# Get a slice object for the specified chromosome region:
	my $slice = $slice_adaptor->fetch_by_region( 'chromosome', $chromo, $pos1, $pos2 );
	# Get transcript adaptor for this slice:
	my @transcripts = @{ $adaptor->fetch_all_by_Slice($slice) };
	return @transcripts;	# our small slices have only have one transcript at max
}

sub output {
	my $s=shift;	# output string
	my $fn=shift;	# input filename
	my $logfile="$fn.log";	# output filename
        unless (-r $logfile) {open(L,">$logfile");close L}	# create if not exists
        open(L,">>$logfile");print L $s;close L;		# append string
	print $s;	# print string
}

sub xref {
	my $a=shift;
	return '-' if not $a;
	my $db_entries = $a->get_all_DBEntries();
	my %rh=();
	my $other_id;
	foreach my $dbe ( @{$db_entries} ) {
		$rh{$dbe->dbname()}=$dbe->display_id();
		$other_id=$dbe->display_id(); # Added 2009-01-23
	}
	# Return RefSeq_dna or UniGene, if present
	if ( defined $rh{ "UniGene" } ) {return $rh{ "UniGene" };}
	if ( defined $rh{ "RefSeq_dna" } ) {return $rh{ "RefSeq_dna" };}
	return $other_id; # Added 2009-01-23: return any ID instead of none
}


sub ens {my $a=shift;return '-' if not $a;return $a->stable_id();}
sub symbol {my $a=shift;return '-' if not $a;return $a->external_name();}
sub strand {my $a=shift;return '-' if not $a;return $a->strand();}

sub display_genes {
	my $array_ref=shift;
	my $info=shift;
	print "\n";
	foreach ( @{$array_ref} ) {
		print "Symbol: ",symbol($_),"\n";
		print "ID: ",ens($_),"\n";
		print "Start: ",$_->start(),"\n";
		print "End: ",$_->end(),"\n";
		print "Strand: ",$_->strand(),"\n";
		print "\n";
	}
}

sub find_genes {
	my $chromo=int(shift);	# chromosome
	my $start_pos=shift;	# start position on chromosome in bp
	my $end_pos=shift;	# end position on chromosome in bp
	my $info=shift;		# Information from the input line
	my $fn=shift;	# input filename
	# start from middle of the query chromosome area
	my $middle=int(($start_pos + $end_pos) / 2.0);
	my @trans_list=chromo_pos($tr_adaptor,$chromo,$middle-$range,$middle+$range);
	display_genes(\@trans_list,$info) if $debug;
	my %dists=(); # perl-hash: distance -> transcript object
	foreach ( @trans_list ) {
		my $distance_to_start;
		# In ensembl, the gene-start is "start" on strand=1 and "end" on strand=-1
		# Distance to where the gene was found has to be subtracted.
		if ($_->strand()==1) {$distance_to_start=($_->start()-$range)}
		else {$distance_to_start=$_->end()-$range}
		$dists{$distance_to_start}=$_;		
	}
	my $down; # nearest gene downstream
	my $up; # nearest gene upstream
	my $down_distance=0; # distance in bp to nearest gene downstream
	my $up_distance=0; # distance in bp to nearest gene upstream
	foreach my $d (sort { $a <=> $b } keys %dists){ # numerically sorted by distance
		next if $d>0; # skip positive distances
		$up=$dists{$d}; # stores nearest upstream gene in last loop iteration
		if ($dists{$d}->strand()==1) {$up_distance=-1*$d;}
		else {$up_distance=$d}
	}
	foreach my $d (sort { $a <=> $b } keys %dists){ # numerically sorted by distance
		next if $d<0; # skip negative distances
		$down=$dists{$d}; # stores nearest upstream gene in first loop iteration
		if ($dists{$d}->strand()==1) {$down_distance=-1*$d;}
		else {$down_distance=$d}
		last;
	}
	# print/save to a line of tab-saparated text
	output( sprintf("%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\t%s\t%d\n", $info,
	ens($down), symbol($down), xref($down), strand($down), $down_distance,
	ens($up), symbol($up), xref($up), strand($up), $up_distance),$fn);
}

my $fn=shift;	# get filename from first command line argument
die if not -r $fn;	# file has to exist
my $header1="Symbol\tChromosome\tStart-Pos\tEnd-Pos\t";
my $header2="Down-Ensembl\tDown-Symbol\tDown-ID\tDown-Strand\tDown-Distance\t";
my $header3="Up-Ensembl\tUp-Symbol\tUp-ID\tUp-Strand\tUp-Distance\n";
my $header=$header1.$header2.$header3;
open (FILE, $fn);	# open the file
my $cnt=0;	# line counter
unlink("$fn.log");
output($header,$fn);
while (<FILE>) {	# loop though the file
	$cnt++;	# increment the line counter
	next if $cnt<2;	# skip table header
	my @line_split=split("\t");	# split line into columns
	my $gene_symbol=$line_split[0];	# Primary Annotation
	my $chromo=$line_split[1];	# Chromosome number
	$chromo=~s/chr//;		# remove the "chr" string
	my $start_pos=$line_split[2];	# Start postion on this chromosome
	my $end_pos=$line_split[3];	# End postion on this chromosome
	next if not $chromo;
	next if not $start_pos;
	next if not $end_pos;
	my $info="$gene_symbol\t$chromo\t$start_pos\t$end_pos";
	find_genes($chromo,$start_pos,$end_pos,$info,$fn);
}
close (FILE);