#!/usr/bin/perl
#This script will setup postgres MLST profile databases by downloading
#the configuration from pubmlst.org.  It can be set to download the 
#appropriate allele sequences and profiles so that the database can be
#kept up-to-date.
#
#Version 0.2 Beta
#Written by Keith Jolley, December 2003-2008
#(c) Copyright 2003-2008, University of Oxford

use warnings;
use strict;

use DBI;
use LWP::Simple;
use Getopt::Std;

###Local configuration values########################
my $configfile='dbases.cfg';
my $webroot='http://pubmlst.org/data/';
my $dbasename='_profiles'; #suffix of database name
my $dbmain='template1';
#####################################################

#Check arguments
#s - setup
#u - update (followed by name of database)
#q - quiet
#f - force
my %opts;
getopts('sqfu:', \%opts);
if (!$opts{'s'} and !$opts{'u'}){
    print "Usage mlstdbase.pl [-sqf] [-u <dbase code>]\n";
    print "Exiting. \n";
    exit(1);
}
if ($opts{'s'} and $opts{'u'}){
    print "The -s (setup) and -u (update) options are mutually exclusive!\n";
    print "Exiting. \n";
    exit(1);
}

#Get list of databases installed on the system
my @dbases;
my $db=DBI->connect("DBI:Pg:dbname=$dbmain") or die "couldn't open template db";
my $sql=$db->prepare("SELECT pg_database.datname") or die "couldn't prepare";
$sql->execute();
while (my $dbase=$sql->fetchrow_array()){
    push @dbases,$dbase;
}
$db->disconnect;

#Download config file to find out which databases are available
#Setup new database if appropriate.
my @data;
my @scheme;
my @dir;
my @loci;
my $clonal_complex=0;
if (my $cfg=get ($webroot.$configfile)){ 
    @data=split /\#\#\#/,$cfg;
    foreach my $schemecfg (@data){
	if ($schemecfg=~/scheme\s*=\s*(.+);/){
	    push @scheme,$1;
	}
	if ($schemecfg=~/dir\s*=\s*(.+);/){
	    push @dir,$1;
	}
    }
    if (!$opts{'q'}){
	print "Database description file retrieved from pubmlst.org.\n\n";
    }
    if ($opts{'u'}){
	&updatedb($opts{'u'});
    }
    if ($opts{'s'}){
	my $db=DBI->connect("DBI:Pg:dbname=$dbmain",'postgres') or die "couldn't open template db";
	print "The following databases are available to set up:\n";
	my $num;
	my %choices;
	for (my $i=0; $i<scalar @scheme; $i++){
	    $num=$i+1;
	    $choices{$num}=1;
	    print "$num) $scheme[$i]\n";
	}
	print "\nQ) Quit\n";
	$choices{'Q'}=1;
	$choices{'q'}=1;
	my $choice="";
	while (!$choices{$choice}){
	    print "Please enter your choice: ";
	    chomp($choice = <STDIN>);
	}
	if ($choice =~ /[Qq]/){
	    print "Quit.\n";
	    $db->disconnect;
	    exit(0);
	}
	print "Setting up $scheme[$choice-1] profiles database ...\n";
	my $newdbase=$dir[$choice-1].$dbasename;
	print "Database name: $newdbase\n";
	#Check if selected database name matches one that already
	#exists and see if it is ok to overwrite.
	if (&dbexists($newdbase)){
	    print "Database $newdbase already exists.\n";
	    print "Is it ok to overwrite [y/n]? ";
	    my $response;
	    chomp ($response=<STDIN>);
	    if (lc($response eq 'y') or lc($response eq 'yes')){
		print "Deleting existing database '$newdbase' ...\n";
		$db->do("DROP DATABASE $newdbase") and print "Database deleted.\n" or die "Couldn't delete database.\n";
	    } else {
		print "Exiting.\n";
		$db->disconnect;
		exit(0);
	    }
	}
	print "Creating new database '$newdbase' ...\n";
	$db->do("CREATE DATABASE $newdbase") and print "Database created.\n" or die "Couldn't create database.\n";
	push @dbases,$newdbase;
	$db->disconnect;
	$db=DBI->connect("DBI:Pg:dbname=$newdbase") or die "couldn't open database $newdbase";
	$db->do("CREATE TABLE users (id INTEGER NOT NULL UNIQUE,user_name varchar NOT NULL UNIQUE,surname varchar NOT NULL,first_name varchar NOT NULL,email varchar NOT NULL,affiliation varchar NOT NULL,status varchar NOT NULL,datestamp date NOT NULL,curator varchar NOT NULL,PRIMARY KEY (id))") and print "Table 'users' created.\n" or die "Couldn't create table 'users'!\n";
	$db->do("INSERT INTO users VALUES(1,'auto','Script','Webscraper','-','-','curator','today','auto')");
	$db->do("GRANT SELECT ON users TO apache");
	$db->do("GRANT SELECT ON users TO remote");
	my $line;
	my @temp=split /;/, $data[$choice-1];
	foreach $line (@temp){
	    if ($line=~/locus\s*=\s*(.+)\s*,\s*(\d+)/){
		print "Creating locus table: $1 ...\n";
		my $locus=$1;
		push @loci, $locus;
		$"=',';
		my $qry="CREATE TABLE $locus (id INTEGER NOT NULL UNIQUE, sender integer NOT NULL, datestamp date NOT NULL,curator integer NOT NULL,sequence varchar NOT NULL UNIQUE, PRIMARY KEY (id),CONSTRAINT $locus\_cur FOREIGN KEY (curator) REFERENCES users ON DELETE NO ACTION ON UPDATE CASCADE,CONSTRAINT $locus\_sen FOREIGN KEY (sender) REFERENCES users ON DELETE NO ACTION ON UPDATE CASCADE)";
		$db->do($qry) and print "Table '$locus' created.\n" or die "Couldn't create table '$locus'!\n";
		$db->do("GRANT SELECT ON $locus TO apache");
		$db->do("GRANT SELECT ON $locus TO remote");
	    }
	    if ($line=~/clonal_complex/){
		$clonal_complex=1;
	    }
	}
	
	$"=' INTEGER NOT NULL, ';
	my $qry="CREATE TABLE profiles (st INTEGER NOT NULL UNIQUE, @loci INTEGER NOT NULL,sender integer NOT NULL,curator integer NOT NULL,datestamp date NOT NULL,";
	if ($clonal_complex){
	    $qry.='clonal_complex varchar,';
	}
	$qry.='PRIMARY KEY(st),CONSTRAINT procurator FOREIGN KEY (curator) REFERENCES users ON DELETE NO ACTION ON UPDATE CASCADE, CONSTRAINT prosender FOREIGN KEY (sender) REFERENCES users ON DELETE NO ACTION';
	for (my $i=0; $i<scalar @loci; $i++){
	    $qry.=",CONSTRAINT pro$loci[$i] FOREIGN KEY ($loci[$i]) REFERENCES $loci[$i] ON DELETE NO ACTION ON UPDATE CASCADE";
	}
	$qry.=');';
	print "Creating profiles table ...\n";
	$db->do($qry) and print "Table 'profiles' created.\n" or die "Couldn't create table 'profiles'!\n";
	$"=',';
	$qry="CREATE UNIQUE INDEX profindex ON profiles (@loci);"; 
	$db->do($qry);
	$db->do("GRANT SELECT ON profiles TO apache");
	$db->do("GRANT SELECT ON profiles TO remote");
	$db->disconnect;
	updatedb($newdbase);
    }
} else {
    print "Cannot retrieve database description file from pubmlst.org.\n";
    print "Exiting.\n";
    exit(1);
}

#Check if database exists on system
sub dbexists {
    (my $newdbase)=@_;
    my $exists=0;
    foreach my $dbase (@dbases){
	if ($newdbase eq $dbase){
	    $exists=1;
	}
    }
    return $exists;
}

#Download allele sequences and profiles and update database.  Warn if there
#are changes unless -f option is in place.
sub updatedb {
    (my $dbase)=@_;
    #Check if database exists on system.
    if (!&dbexists($dbase)){
	print "Database '$dbase' is not set up on your system!\n";
	exit(1);
    }
    #Determine which config to use
    my $code='';
    if ($dbase=~/(.*)$dbasename/){
	$code=$1;
    }
    my $index=-1;
    for (my $i=0; $i<scalar @dir; $i++){
	if ($code eq $dir[$i]){
	    $index=$i;
	}
    }
    if ($index == -1){
	print "Configuration for database '$dbase' does not exist on pubmlst.org.  Unable to continue.\n";
	exit(1);
    }
    my $db=DBI->connect("DBI:Pg:dbname=$dbase",'postgres') or die "couldn't open database $dbase";
    my @temp=split /;/, $data[$index];
    foreach my $line (@temp){
	if ($line=~/locus\s*=\s*(.+)\s*,/){
	    my $locus=$1;
	    if (!$opts{'q'}){
		print "Downloading sequences for locus $locus ...\n";
	    }
	    if (my $seqfile=get ($webroot."alleles/$code/$locus.tfa")){
		my @seqs=split />/,$seqfile;
		my $qry="SELECT sequence FROM $locus WHERE id=?";
		my $sql=$db->prepare($qry) or die "couldn't prepare";
		my $qry2="UPDATE $locus SET sequence=?,sender=1,curator=1 WHERE id=?";
		my $sql2=$db->prepare($qry2) or die "couldn't prepare";
		my $qry3="INSERT INTO $locus VALUES (?,1,'today',1,?)";
		my $sql3=$db->prepare($qry3) or die "couldn't prepare";
		foreach my $seq (@seqs){
		    my ($allele,$id);
		    if ($seq=~/$locus-?(\d+)(\D*)/){
			$id=$1;
			$allele=$2;
			$allele=~s/\s//g;
			#check if allele is already in the database
			$sql->execute($id);
			(my $dbseq)=$sql->fetchrow_array();
			if ($dbseq){
			    if ($dbseq ne $allele){
				print "The downloaded $locus$id sequence is different to the local copy.\n"; 
				if (!$opts{'f'}){
				    print "Overwrite with new sequence? [y/n] ";
				    my $response;
				    chomp ($response=<STDIN>);
				    if (lc($response eq 'y') or lc($response eq 'yes')){
					$sql2->execute($allele,$id);
				    }
				} else {
				    print "Overwriting! ...\n";
				    $sql2->execute($allele,$id);
				}				if (!$opts{'f'}){
				    print "Overwrite with new sequence? [y/n] ";
				    my $response;
				    chomp ($response=<STDIN>);
				    if (lc($response eq 'y') or lc($response eq 'yes')){
					$sql2->execute($allele,$id);
				    }
				} else {
				    print "Overwriting! ...\n";
				    $sql2->execute($allele,$id);
				}
			    }
			} else {
			    #sequence not in database
			    if (!$opts{'q'}){
				print "Adding $locus-$id ...\n";
			    }
			    $sql3->execute($id,$allele);
			}
		    }
		}
	    } else {
		print "Could not download sequences for locus $locus.  Unable to continue.\n";
		$db->disconnect;
		exit(1);
	    }
	}
    }
    #Download profiles
    if (!$opts{'q'}){
	print "Downloading profiles ...\n";
    }
    if (my $profiles=get ($webroot."profiles/$code.txt")){
	my $clonal_complex='';
	my @loci;
	my $sql;
	my @proflist=split /\n/,$profiles;
	foreach my $line (@proflist){
	    $line=~s/\r//g;
	    my @values=split /\t/,$line;
	    if ($values[0]=~/\d+/){  #not the header line
		#check if ST is already in the database
		$sql->execute($values[0]);
		my @dbvalues=$sql->fetchrow_array();
		my $profdiff=0;
		my $complexdiff=0;
		undef my @downprof;
		undef my @dbprof;
		my $cols=scalar @values;
		if ($dbvalues[0]){
		    if ($clonal_complex){
			$cols--;
		    }
		    for (my $i=1; $i<$cols; $i++){
			push @downprof,$values[$i];
			push @dbprof,$dbvalues[$i];
			if ($values[$i] ne $dbvalues[$i]){
			    $profdiff=1;
			}
		    }
		    if ($profdiff){
			$"=',';
			print "Profile discrepancy ST-$values[0]: local db (@dbprof): downloaded (@downprof)\n";
		    }
		    if ($values[$cols] ne $dbvalues[$cols]){
			$complexdiff=1;
		    }
		    if ($clonal_complex and $complexdiff){
			print "Complex discrepancy ST-$values[0]: local db ($dbvalues[$cols]): downloaded ($values[$cols])\n";
		    }
		    if ($profdiff or $complexdiff){
			if (!$opts{'f'}){
			    print "Overwrite with new values? [y/n] ";
			    my $response;
			    chomp ($response=<STDIN>);
			    if (lc($response eq 'y') or lc($response eq 'yes')){
				$db->do("DELETE FROM profiles WHERE st='$values[0]'");
				$"="','";
				$db->do("INSERT INTO PROFILES VALUES ('@values',1,1,'today')");
			    }
			} else {
			    print "Overwriting! ...\n";
			    $db->do("DELETE FROM profiles WHERE st='$values[0]'");
			    $"="','";
			    $db->do("INSERT INTO PROFILES VALUES ('@values',1,1,'today')");
			}
		    }
		} else {
		    if (!$opts{'q'}){
			$"=',';
			print "Adding ST-$values[0] (@values) ...\n";
		    }
		    $"="','";
		    $db->do("INSERT INTO profiles VALUES ('@values',1,1,'today')");
		}
	    } else {
		#Header line
		$clonal_complex=pop @values;
		shift @values;
		@loci=@values;
		$"=',';
		my $qry="SELECT * FROM profiles WHERE st=?";
		$sql=$db->prepare($qry) or die "couldn't prepare";	
	    }
	}
    } else {
	print "Could not download profiles.  Unable to continue.\n";
    }	
    $db->disconnect;
}


