#!/usr/bin/perl
# List of chromosome regions (chrX:P1-P2) -> Name/IDs of genes at or near position
use lib "/opt/ensembl/modules";
use Bio::EnsEMBL::Registry;
use strict;

my $registry = 'Bio::EnsEMBL::Registry';
$registry->load_registry_from_db(
    -host => 'ensembldb.ensembl.org',
    -user => 'anonymous'
);

# Slice adaptor for mouse genome:
my $slice_adaptor = $registry->get_adaptor( 'Mouse', 'Core', 'Slice' );
my $tr_adaptor    = $registry->get_adaptor( 'Mouse', 'Core', 'Transcript' );
my $gene_adaptor = $registry->get_adaptor( 'Mouse', 'Core', 'Gene' );

sub chromo_pos { # returns gene adaptor
	my $chromo=int(shift);	# chromosome
	my $pos1=int(shift);		# start position in nucleotides on chromosome
	my $pos2=int(shift);		# end position
	# Get a slice object for the specified chromosome region:
	my $slice = $slice_adaptor->fetch_by_region( 'chromosome', $chromo, $pos1, $pos2 );
	# Get transcript adaptor for this slice,
	# now using gene adaptor instead of transcript adaptor:
	my @transcripts = @{ $gene_adaptor->fetch_all_by_Slice($slice) };
	return $transcripts[0]	# our small slices have only have one transcript at max
}

sub chromo_pos2 { # returns transcript adaptor
	my $chromo=int(shift);	# chromosome
	my $pos1=int(shift);		# start position in nucleotides on chromosome
	my $pos2=int(shift);		# end position
	# Get a slice object for the specified chromosome region:
	my $slice = $slice_adaptor->fetch_by_region( 'chromosome', $chromo, $pos1, $pos2 );
	# Get transcript adaptor for this slice:
	my @transcripts = @{ $tr_adaptor->fetch_all_by_Slice($slice) };
	return $transcripts[0]	# our small slices have only have one transcript at max
}

sub gene_search_unlim { # not used anymore
	my $chromo=int(shift);	# chromosome
	my $p1=shift;	# start position on chromosome in bp
	my $p2=shift;	# end position on chromosome in bp
	my $p1f=shift;	# factor for start position
	my $p2f=shift;	# factor for end position
	my $i=0;	# increment counter
	# increment end-position by 10000 until a gene is found
	while ( not chromo_pos($chromo,$p1+$i*$p1f,$p2+$i*$p2f) ) {$i+=10000}
	# decrement by 1000 until the gene is not found anymore
	while ( chromo_pos($chromo,$p1+$i*$p1f,$p2+$i*$p2f) ) {$i-=1000}
	$i+=1000;	# add 1000 to move back into the gene
	# decrement by 100 until the gene is not found anymore
	while ( chromo_pos($chromo,$p1+$i*$p1f,$p2+$i*$p2f) ) {$i-=100}
	$i+=100; # add 100 to move back into the gene
	# decrement by 10 until the gene is not found anymore
	while ( chromo_pos($chromo,$p1+$i*$p1f,$p2+$i*$p2f) ) {$i-=10}
	$i+=10; # add 10 to move back into the gene
	# decrement by 1 until the gene is not found anymore
	while ( chromo_pos($chromo,$p1+$i*$p1f,$p2+$i*$p2f) ) {$i-=1}
	return $i+1;	# return position inside gene
}

sub gene_search10000 { # Added 2009-01-23: only search 10000bp
	my $chromo=int(shift);	# chromosome
	my $p1=shift;	# start position on chromosome in bp
	my $p2=shift;	# end position on chromosome in bp
	my $p1f=shift;	# factor for start position
	my $p2f=shift;	# factor for end position
	my $i=0;	# increment counter
	# increment end-position by 1000 up to 10000 or a gene is found
	for ($i=0;$i<=10000;$i+=1000) {last if chromo_pos($chromo,$p1+$i*$p1f,$p2+$i*$p2f)}
	if (not chromo_pos($chromo,$p1+$i*$p1f,$p2+$i*$p2f)) {return 0}
	# decrement by 1000 until the gene is not found anymore
	while ( chromo_pos($chromo,$p1+$i*$p1f,$p2+$i*$p2f) ) {$i-=1000}
	$i+=1000;	# add 1000 to move back into the gene
	# decrement by 100 until the gene is not found anymore
	while ( chromo_pos($chromo,$p1+$i*$p1f,$p2+$i*$p2f) ) {$i-=100}
	$i+=100; # add 100 to move back into the gene
	# decrement by 10 until the gene is not found anymore
	while ( chromo_pos($chromo,$p1+$i*$p1f,$p2+$i*$p2f) ) {$i-=10}
	$i+=10; # add 10 to move back into the gene
	# decrement by 1 until the gene is not found anymore
	while ( chromo_pos($chromo,$p1+$i*$p1f,$p2+$i*$p2f) ) {$i-=1}
	return $i+1;	# return position inside gene
}

sub output {
	my $s=shift;	# output string
	my $fn=shift;	# input filename
	my $logfile="$fn.log";	# output filename
        unless (-r $logfile) {open(L,">$logfile");close L}	# create if not exists
        open(L,">>$logfile");print L $s;close L;		# append string
	print $s;	# print string
}

sub xref {
	my $db_entries = shift->get_all_DBEntries();
	my %rh=();
	my $other_id;
	foreach my $dbe ( @{$db_entries} ) {
		$rh{$dbe->dbname()}=$dbe->display_id();
#		print $dbe->dbname(),": ",$dbe->display_id(),"\n";
		$other_id=$dbe->display_id(); # Added 2009-01-23
	}
	# Return RefSeq_dna or UniGene, if present
	if ( defined $rh{ "UniGene" } ) {return $rh{ "UniGene" };}
	if ( defined $rh{ "RefSeq_dna" } ) {return $rh{ "RefSeq_dna" };}
	return $other_id; # Added 2009-01-23: return any ID instead of none
}

sub find_genes {
	my $chromo=int(shift);	# chromosome
	my $start_pos=shift;	# start position on chromosome in bp
	my $end_pos=shift;	# end position on chromosome in bp
	my $info=shift;		# Information from the input line
	my $fn=shift;	# input filename
	# Search for position of next downstream gene by extending slice end position
	my $gene_pos_plus=gene_search10000($chromo,$start_pos,$end_pos,0,1);
	# Get transcript adaptor for gene at this position 
	my $tr_plus=chromo_pos($chromo,$start_pos,$end_pos+$gene_pos_plus); # gene
	my $tr_plus2=chromo_pos2($chromo,$start_pos,$end_pos+$gene_pos_plus); # transcript
	# Search for position of next upstream gene by extending slice end position
	my $gene_pos_minus=gene_search10000($chromo,$start_pos,$end_pos,-1,0);
	# Get transcript adaptor for gene at this position
	my $tr_minus=chromo_pos($chromo,$start_pos-$gene_pos_minus,$end_pos); # gene
	my $tr_minus2=chromo_pos2($chromo,$start_pos-$gene_pos_minus,$end_pos); # transcript
	# Added 2009-01-23: its now possible to have no gene up or downstream
	my $plus_sign="";
	my $plus_ens="-";
	my $plus_name="-";
	my $plus_id="-";
	my $plus_strand="-";
	my $plus_distance="-";
	if ($tr_plus) { # gene found downstream?
		$plus_strand=$tr_plus->strand();
		# Sign (+/-) for downstream position depends on which strand gene is on
		if ($plus_strand==1) {$plus_sign="-";}
		else {$plus_sign="+";}
		$plus_name=$tr_plus->external_name();
		$plus_ens=$tr_plus->stable_id();
		$plus_id=xref($tr_plus2);
		if ($plus_strand==1) {$plus_distance=$tr_plus->start();}
		else {$plus_distance=$tr_plus->end();}
	}
	my $minus_sign="";
	my $minus_ens="-";
	my $minus_name="-";
	my $minus_id="-";
	my $minus_strand="-";
	my $minus_distance="-";
	if ($tr_minus) { # gene found upstream?
		$minus_strand=$tr_minus->strand();
		# Sign (+/-) for upstream position depends on which strand gene is on	
		if ($minus_strand==1) {$minus_sign="+";}
		else {$minus_sign="-";}
		$minus_name=$tr_minus->external_name();
		$minus_ens=$tr_minus->stable_id();
		$minus_id=xref($tr_minus2);
		if ($minus_strand==1) {$minus_distance=$gene_pos_minus - $tr_minus->start();}
		else {$minus_distance=$gene_pos_minus - $tr_minus->end();}
	}
	# print/save to a line of tab-saparated text
	output( sprintf("%s\tOut\t%s\t%s\t%s\t%s\t%s%d\t%s\t%s\t%s\t%s\t%s%d\n", $info,
	$plus_ens, $plus_name, $plus_id, $plus_strand, $plus_sign, $plus_distance, 
	$minus_ens, $minus_name, $minus_id, $minus_strand, $minus_sign, $minus_distance),$fn);
}

my $fn=shift;	# get filename from first command line argument
die if not -r $fn;	# file has to exist
my $header1="Symbol\tChromosome\tStart-Pos\tEnd-Pos\tOutOrIn\t";
my $header2="Down-Ensembl\tDown-Symbol\tDown-ID\tDown-Strand\tDown-Distance\t";
my $header3="Up-Ensembl\tUp-Symbol\tUp-ID\tUp-Strand\tUp-Distance\n";
my $header=$header1.$header2.$header3;
open (FILE, $fn);	# open the file
my $cnt=0;	# line counter
unlink("$fn.log");
output($header,$fn);
while (<FILE>) {	# loop though the file
	$cnt++;	# increment the line counter
	next if $cnt<2;	# skip table header
	my @line_split=split("\t");	# split line into columns
	my $gene_symbol=$line_split[0];	# Primary Annotation
	my $chromo=$line_split[1];	# Chromosome number
	$chromo=~s/chr//;		# remove the "chr" string
	my $start_pos=$line_split[2];	# Start postion on this chromosome
	my $end_pos=$line_split[3];	# End postion on this chromosome
	next if not $chromo;
	next if not $start_pos;
	next if not $end_pos;
	my $info="$gene_symbol\t$chromo\t$start_pos\t$end_pos";
	my $tr=chromo_pos($chromo,$start_pos,$end_pos);
	my $tr2=chromo_pos2($chromo,$start_pos,$end_pos);
	if ($tr) {
		my $name=$tr->external_name();
		my $id=xref($tr2);
		my $distance;
		if ($tr->strand()==1) {$distance=$tr->start();}
		else {$distance=$tr->end();}
		my $strand=$tr->strand();
		my $ens=$tr->stable_id();
		# Added 2009-01-23: same format for inside gene as outside gene
		output( sprintf("%s\tIn\t%s\t%s\t%s\t%s\t%s%d\t%s\t%s\t%s\t%s\t%s%d\n", $info,
		$ens, $name, $id, $strand, "", $distance, $ens, $name, $id, $strand, "", $distance),$fn);
	} else {
		find_genes($chromo,$start_pos,$end_pos,$info,$fn);
	}
}
close (FILE);