#!/usr/bin/perl
#This script will concatenate allele sequences defined by allelic profiles.
#Allele sequences should be provided in FASTA format and be ordered 
#sequentially from allele#1.
#
#Version 0.3 Beta
#Written by Keith Jolley, January 2004
#(c) Copyright 2004, University of Oxford

use warnings;
use strict;

use LWP::Simple;

if (!$ARGV[0]){
    print "Usage concat.pl <config file>\n";
    exit(1);
}
my $inputfile = $ARGV[0];
if (!(-e $inputfile)){
    print "Configuration file '$inputfile' does not exist.\n";
    exit(1);
}

my (%locuspath,%orf);
my @loci;
my $profiles;
#parse config file
open (IN,$inputfile) or die "Can not open file '$inputfile'.\n";
while (my $line = <IN>){
    if ($line =~/^\s*locus\s*:([\S]*)\s*=\s*(\S*)\s*:\s*(\d)\s*$/){
	$locuspath{$1}=$2;
	push @loci, $1;
	if ($3 != 1 && $3 != 2 && $3 != 3){
	    print "Reading frame for locus '$1' not valid - should be 1,2 or 3.\n";
	    exit(1);
	}
	$orf{$1}=$3;
    }
    if ($line =~/^\s*profiles\s*=\s*(\S*)\s*$/){
	$profiles=$1;
    }
}
close IN;


#Read in FASTA files either locally or from web site
my %allele;
foreach my $locus (@loci){
    my $fastafile;
    #Check if file is on the web or local
    if ($locuspath{$locus}=~/http:\/\//){
	if (!($fastafile=get ($locuspath{$locus}))){
	    print "Could not download '$locuspath{$locus}'\n";
	    exit(1);
	}
    } else {
	if (-e $locuspath{$locus}){
	    open (FASTA, $locuspath{$locus}) or die "File '$locuspath{$locus}' could not be opened.\n";
	    while (my $line = <FASTA>){
		$fastafile.=$line;
	    }
	    close FASTA;
	} else {
	    print "File '$locuspath{$locus}' does not exist.\n";
	    exit(1);
	}
    }
    my $i=1;
    my @seqs=split />.*?\n/,$fastafile;
    shift @seqs;
    foreach my $seq (@seqs){
	$seq=~s/[\s\r]//g;
	$allele{$locus}[$i]=$seq;
	$i++;
    }
}
#Alleles are stored in a hash of arrays
#so e.g. abcZ-126 can be retrieved as $allele{'abcZ'}[126]

#Parse profiles file
#First row gives column headings
if (!$profiles){
    print "No profile file defined.\n";
    exit(1);
}
my $profilesfile;
if (-e $profiles){
    open (PROFILES, $profiles) or die "File '$profiles' could not be opened.\n";
    while (my $line = <PROFILES>){
	$profilesfile.=$line;
    }
    close PROFILES;
} else {
    print "Profiles file does not exist.\n";
    exit(1);
}
$profilesfile=~s/\r//g; #strip carriage returns
my @profiles=split /\n/,$profilesfile;
my @headings=split /\t/,$profiles[0];

#Check that we have allele files for each heading other than the identifier
if (scalar @headings>1){
    for (my $i = 1;$i<scalar @headings;$i++){
	if (!($locuspath{$headings[$i]})){
	    print "Either no allele file found or configuation set incorrectly for $headings[$i].\n";
	    print "Check format in configuration - should be of form locus:path:orf\n";
	    exit(1);
	}
     }
} else {
    print "No loci headings found in '$profiles'\n";
    exit(1);
}

#Check that we have an allele sequence for each allele number in profiles
if (scalar @profiles>1){
    for (my $i=1;$i<scalar @profiles;$i++){
	my @profile=split /\t/,$profiles[$i];
	if (scalar @profile == 0){ #ignore blank lines
	    next;
	}
	if (scalar @profile != scalar @headings){
	    print "Invalid data for profile '$profile[0]'.\n";
	    exit(1);
	}
	for (my $j=1;$j<scalar @headings;$j++){
	    if (!($allele{$headings[$j]}[$profile[$j]])){
		print "No sequence found for $headings[$j]-$profile[$j].\n";
		exit(1);
	    }
	}
    }
} else {
    print "No profiles found in '$profiles'.\n";
    exit(1);
}
    
#Concatenate the sequences and print them out
shift @profiles; #we don't need the header row
foreach my $singleprofile (@profiles){
    my @profile=split /\t/,$singleprofile;
    if (scalar @profile == 0){ #ignore blank lines
	next;
    }
    print ">$profile[0]\n";
    my $seq;
    for (my $i=1;$i<scalar @headings;$i++){
	my $alleleseq=$allele{$headings[$i]}[$profile[$i]];
	$seq.=substr($alleleseq,$orf{$headings[$i]}-1);
    }
    print &linesplit($seq)."\n";
}

sub linesplit {
    my ($seq)=@_;
    my $i=0;
    my $returnseq;
    my $linepos=0;
    for (my $pos=0;$pos<length($seq);$pos++){
	$linepos++;
	$returnseq.=substr($seq,$pos,1);
	if ($linepos==60){
	    $returnseq.="\n";
	    $linepos=0;
	}
    }
    return $returnseq;
}
