#!/usr/bin/perl -w
#
# $Id$
#
# Copyright 2003-2008 Aaron Mackey
#
# This file is part of BioSQL.
#
# BioSQL is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# BioSQL is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with BioSQL. If not, see <http://www.gnu.org/licenses/>.

=head1 NAME

load_ncbi_taxonomy.pl

=head1 SYNOPSIS

  Usage: load_ncbi_taxonomy.pl
        --dbname     # name of database to use
        --dsn        # the DSN of the database to connect to
        --driver     # "mysql", "Pg", "Oracle", "SQLite" (default "mysql")
        --host       # optional: host to connect with
        --port       # optional: port to connect with
        --dbuser     # optional: user name to connect with
        --dbpass     # optional: password to connect with
        --download   # optional: whether to download new NCBI taxonomy data
        --directory  # optional: where to store/look for the data
        --schema     # optional: Pg only, load using given schema

=head1 DESCRIPTION

This script loads or updates a biosql schema with the NCBI Taxon
Database. There are a number of options to do with where the biosql
database is (i.e., database name, hostname, user for database,
password).

This script may download the NCBI Taxon Database from the NCBI FTP
server on-the-fly (ftp://ftp.ncbi.nih.gov/pub/taxonomy/). Otherwise it
expects the files to have been downloaded already.

You can use this script to load taxon data into a fresh instance of
biosql, or to update the taxon content of an already populated biosql
database. Because it updates taxon nodes rather than dumping and
re-inserting them, bioentries referencing those existing taxon nodes
are unaffected.

An update will erase all changes you made to taxon nodes and their
names which have an NCBI TaxonID set. Names of nodes that do not have
an NCBI TaxonID will be left untouched.

Note that we used to have the convention of re-using the NCBI TaxonID
as the primary key of the taxon table, but as of BioSQL v1.0.1 that is
no longer the case. If you happen to rely on that former behavior in
your code, you will need to add a post-processing step that changes
the primary keys to the NCBI taxon IDs.
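
The following is an example of a typical invocation for a first-time
load that also downloads the data; the database name, driver, and user
shown here are placeholders that you will need to adapt to your own
setup:

  perl load_ncbi_taxonomy.pl --download --dbname biosql \
       --driver Pg --dbuser postgres
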
=head1 ARGUMENTS

=over

=item --dbname

name of database to use

=item --sid

synonym for --dbname for Oracle folks

=item --dsn

the DSN of the database to connect to, overrides --dbname, --driver,
--host, and --port. The default is the value of the DBI_DSN
environment variable.

=item --driver

the DBD driver, one of mysql, Pg, or Oracle. If your driver is not
listed here, use --dsn instead.

=item --host

optional: host to connect with

=item --port

optional: port to connect with

=item --dbuser

optional: user name to connect with. The default is the value of the
DBI_USER environment variable.

=item --dbpass

optional: password to connect with. The default is the value of the
DBI_PASSWORD environment variable.

=item --schema

The schema under which the BioSQL tables reside in the database. For
Oracle and MySQL this is synonymous with the user, and will not have
an effect. PostgreSQL since v7.4 supports schemas as the namespace
for collections of tables within a database.

=item --download

optional: whether to download new NCBI taxonomy data, default is no
download

=item --directory

optional: where to store/look for the data, default is ./taxdata

=item --nodelete

Flag meaning do not delete retired nodes. You may want to specify
this if you have sequence records referencing the retired nodes, in
case they happen to be leaves. Otherwise you will get a foreign key
constraint failure saying something like 'child record found' if
there is a bioentry for that species. The retired nodes will still be
printed, so that you can then decide for yourself afterwards what to
do with the bioentries that reference them.

=item --verbose=n

Sets the verbosity level, default is 1. 0 = silent, 1 = print current
step, 2 = print current step and progress statistics.

=item --help

print this manual and exit

=item --allow_truncate

Flag to allow for non-transactional TRUNCATE. This presently applies
only to deleting and re-loading the taxon names table. The script
will attempt to perform the much faster TRUNCATE operation instead of
a DELETE. Some RDBMSs, like PostgreSQL, however, prohibit TRUNCATE
from within a transaction, because they cannot roll it back. If this
flag is specified, the TRUNCATE will still be performed, but then
outside of a transaction. This means that from the time this
operation is done until the names have been fully loaded, there will
be no or only partial taxon names available for querying, leading to
inconsistent or incomplete answers to queries. This is therefore
disabled by default. Note though that for instance in PostgreSQL
TRUNCATE is several orders of magnitude faster.

=item --chunksize

The number of rows after which to commit and possibly recompute
statistics. This presently only applies to the nested set rebuild
phase. It tries to address the potentially marked performance
degradation in PostgreSQL while updating the taxon rows. The downside
of this approach is that because computing statistics in PostgreSQL
cannot run within a transaction, partially rebuilt nested set values
have to be committed at regular intervals. You can disable the
chunked commits by supplying a value of 0.

If you run on PostgreSQL and you are not sure about the performance
win, try --chunksize=0 --verbose=2. Watch the performance statistics
during the nested set rebuild phase. If you see a marked decrease in
rows/s over time, down to values significantly below 100 rows/s, you
may want to run a chunked rebuild. Otherwise keep it disabled. For
database and query consistency, disabling it is generally
preferable. The default presently is to disable it. A suitable value
for PostgreSQL according to test runs would be 40,000; an example
invocation follows this list.

=back
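
As an example, the following runs a chunked nested set rebuild on
PostgreSQL with progress statistics, using the chunk size suggested
above (database name and user are again placeholders):

  perl load_ncbi_taxonomy.pl --dbname biosql --driver Pg \
       --dbuser postgres --chunksize=40000 --verbose=2
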
=head1 Authors

Aaron Mackey E<lt>amackey at virginia.eduE<gt>

=head1 Contributors

Hilmar Lapp E<lt>hlapp at gmx.netE<gt>

=cut

use strict;
use DBI;
use POSIX;
use Getopt::Long;
use LWP::UserAgent;

####################################################################
# Global defaults or definitions, mostly changeable through commandline
####################################################################
my $help = 0;            # whether to display the help page
my $db;                  # the name of the database or schema
my $dsn = $ENV{DBI_DSN}; # the full DSN -- will be built if not provided
my $host;                # host name of the server
my $port;                # port to which to connect
my $user = $ENV{DBI_USER};     # the user to connect as
my $pass = $ENV{DBI_PASSWORD}; # the password for the user
our $driver;             # the DBI driver module
my $schema;              # for PostgreSQL, the schema to use, if any
my $dir = "taxdata";     # the download and data directory
my $download = 0;        # whether to download from NCBI first
our $allow_truncate = 0; # whether or not to allow the names delete
                         # and reload to span more than one transaction
my $pgchunk = 40000;     # the number of rows after which to vacuum in
                         # the nested set rebuilding phase
our $chunksize = 0;      # disable by default
our $verbose = 1;        # guess what
our $nodelete = 0;       # whether not to delete retired taxon nodes

# not changeable through command-line:
my %tablemaps = (
    "mysql" => {
        "taxon" => "taxon",
        "taxon_name" => "taxon_name",
    },
    "Pg" => {
        "taxon" => "taxon",
        "taxon_name" => "taxon_name",
    },
    "Oracle" => {
        "taxon" => "bs_taxon",
        "taxon_name" => "bs_taxon_name",
        # we can't truncate on a view ...
        "taxon_name_table" => "taxon_name",
    },
    "SQLite" => {
        "taxon" => "taxon",
        "taxon_name" => "taxon_name",
    },
);
my %tablemap;

####################################################################
# end of global defaults
####################################################################

my $ok = GetOptions("help"       => \$help,
                    "dbname=s"   => \$db,
                    "sid=s"      => \$db,
                    "database=s" => \$db,
                    "dsn=s"      => \$dsn,
                    "host=s"     => \$host,
                    "port=i"     => \$port,
                    "user=s"     => \$user,
                    "dbuser=s"   => \$user,
                    "password=s" => \$pass,
                    "dbpass=s"   => \$pass,
                    "driver=s"   => \$driver,
                    "schema=s"   => \$schema,
                    "allow_truncate" => \$allow_truncate,
                    "chunksize=i"=> \$chunksize,
                    "directory=s"=> \$dir,
                    "download"   => \$download,
                    "nodelete"   => \$nodelete,
                    "verbose=i"  => \$verbose,
                   );

#
# erroneous arguments or help page requested?
#
if($help || (!$ok)) {
    system("perldoc $0");
    exit($ok ? 0 : 2);
}

#
# setup / massage / sanity-check arguments
#

# download directory:
if($dir) {
    mkdir $dir unless -e $dir;
    # remove trailing directory separator, if necessary:
    $dir =~ s!/$!!;
}

# database name:
die "Must supply --dbname argument!\n" unless $db;

# build DSN if not provided
if (!$dsn) {
    $driver = "mysql" unless $driver;
    $dsn = "dbi:$driver:";
    my $dbarg = "dbname=";
  SWITCH: {
        if ($driver eq "mysql") {
            $dbarg = "database=";
            last SWITCH;
        }
        if ($driver eq "Oracle") {
            $dbarg = ($host || $port) ? "sid=" : "";
            last SWITCH;
        }
    }
    $dsn .= $dbarg.$db;
    $dsn .= ";host=$host" if $host;
    $dsn .= ";port=$port" if $port;
}
#
# go get the files we need if download requested
#
if ($download) {
    print STDERR "Downloading NCBI taxon database to $dir\n" if $verbose;
    download_taxondb($dir);
}

#
# now connect and setup the SQL statements
#
my $dbh = DBI->connect($dsn, $user, $pass,
                       { RaiseError => 0,
                         AutoCommit => 1,
                         PrintError => 0,
                       }
                      ) or die $DBI::errstr;

# recover the driver if it wasn't specified on the command line
$driver = $dbh->{Driver}->{Name} if $dsn && !$driver;

# if this is PostgreSQL and a schema was named, make sure it's in the
# search path
if (($driver eq "Pg") && $schema) {
    $dbh->do("SET search_path TO $schema, public") or die $DBI::errstr;
}

# Set foreign keys to be on if the schema is SQLite
if ($driver eq "SQLite") {
    $dbh->do("PRAGMA foreign_keys = ON") or die $DBI::errstr;
}

# chunksize:
if(! defined($chunksize)) {
    $chunksize = ($driver eq "Pg") ? $pgchunk : 0;
}

# tablemap:
if(exists($tablemaps{$driver})) {
    %tablemap = %{$tablemaps{$driver}};
} else {
    # let's use mysql mapping as the default
    %tablemap = %{$tablemaps{"mysql"}};
}
my $taxontbl = $tablemap{taxon};
my $taxonnametbl = $tablemap{taxon_name};

my %sth = (
    #
    # insert/update/delete taxon nodes
    #
    add_tax =>
        'INSERT INTO '.$taxontbl.' (ncbi_taxon_id, parent_taxon_id, '.
        'node_rank, genetic_code, mito_genetic_code) '.
        'VALUES (?, ?, ?, ?, ?)'
    ,
    upd_tax =>
        'UPDATE '.$taxontbl.' SET parent_taxon_id = ?, node_rank = ?, '.
        'genetic_code = ?, mito_genetic_code = ? WHERE taxon_id = ?'
    ,
    del_tax =>
        'DELETE FROM '.$taxontbl.' WHERE ncbi_taxon_id = ?'
    ,
    upd_tax_parent =>
        'UPDATE '.$taxontbl.' SET parent_taxon_id = ? WHERE taxon_id = ?'
    ,
    #
    # insert/update/delete taxon names
    #
    add_taxname =>
        'INSERT INTO '.$taxonnametbl.' (taxon_id, name, name_class) '.
        'VALUES (?, ?, ?)'
    ,
    upd_taxname => # this is actually not used presently
        'UPDATE '.$taxonnametbl.' SET taxon_id = ?, name = ?, '.
        'name_class = ? WHERE id = ?'
    ,
    del_taxname => # this is actually not used presently
        'DELETE FROM '.$taxonnametbl.' WHERE id = ?'
    ,
    #
    # rebuild the nested set values
    #
    get_children =>
        'SELECT taxon_id, left_value, right_value FROM '.$taxontbl.
        ' WHERE parent_taxon_id = ? ORDER BY ncbi_taxon_id'
    ,
    set_nested_set =>
        'UPDATE '.$taxontbl.' SET left_value = ?, right_value = ? '.
        'WHERE taxon_id = ?'
    ,
    unset_nested_set => $driver eq "mysql" ?
        # MySQL sometimes is horribly broken. The statement that works
        # for everybody else is horribly slow in MySQL because it does
        # a full table scan. Ugh.
        ['UPDATE '.$taxontbl.' SET left_value = NULL, right_value = NULL '.
         'WHERE left_value = ?',
         'UPDATE '.$taxontbl.' SET left_value = NULL, right_value = NULL '.
         'WHERE right_value = ?']
        :
        'UPDATE '.$taxontbl.' SET left_value = NULL, right_value = NULL '.
        'WHERE left_value = ? OR right_value = ?'
    ,
);

# prepare all our statements
@sth{keys %sth} = map {
    ref($_) ? [map { $dbh->prepare($_); } @$_] : $dbh->prepare($_);
} values %sth;

# install the exit handler
END {
    end_work($driver, $dbh);
}

my @new;
my @old;
my ($ins, $upd, $del, $nas);

print STDERR "Loading NCBI taxon database in $dir:\n" if $verbose;

##### enter the taxonomy nodes:

print STDERR "\t... retrieving all taxon nodes in the database\n"
    if $verbose;

# retrieve all nodes currently in the database
@old = @{
    $dbh->selectall_arrayref(
        'SELECT taxon_id, ncbi_taxon_id, parent_taxon_id, node_rank, '.
        'genetic_code, mito_genetic_code '.
        'FROM '.$taxontbl.' ORDER BY ncbi_taxon_id'
    ) || []
};

print STDERR "\t... reading in taxon nodes from nodes.dmp\n" if $verbose;

# slurp in all nodes from the input nodes dump
open(TAX, "<$dir/nodes.dmp") or
    die "Couldn't open data file $dir/nodes.dmp: $!\n";
while (<TAX>) {
    my @row = split(/\s*\|\s*/o, $_);
    # record format: [taxon_id (the NCBI ID until the primary key is
    # known), ncbi_taxon_id, parent, rank, genetic_code, mito_genetic_code]
    my $rec = [ @row[0, 0..2, 6, 8] ];
    # only keep if we have values here (apparently sometimes we do not)
    next unless grep { defined($_) && (length($_) > 0) } @$rec;
    push @new, $rec;
}
close(TAX);

print STDERR "\t... insert / update / delete taxon nodes\n" if $verbose;

# we also try to minimize the parent updates by mapping NCBI
# taxonIDs to primary keys for the nodes we already have
my $nodesToUpdate = map_parent_ids(\@old, \@new);

# start transaction, possibly lock tables, etc.
begin_work($driver, $dbh);

# taxon has a self-referential foreign key, which we need to defer,
# remove, or whatever
unconstrain_taxon($driver, $dbh);

($ins, $upd, $del, $nas) =
    handle_diffs(\@old,
                 \@new,
                 sub { return $sth{add_tax}->execute(@_[1..5]) },
                 sub { return $sth{upd_tax}->execute(@_[2..5,0]) },
                 sub { return $sth{del_tax}->execute(@_[1..1]) }
                );

# to avoid having to look up the primary key for NCBI taxonIDs, we'll
# create a map of NCBI taxonIDs to primary keys to speed things up
my %ncbiIDmap = map { ($_->[1], $_->[0]); } @new;

# carry out the parent ID updates we determined earlier
print STDERR "\t... updating new parent IDs\n" if $verbose;

my $n = 0;
my $time = time();
foreach my $nodeRec (@$nodesToUpdate) {
    if (!$sth{upd_tax_parent}->execute($ncbiIDmap{$nodeRec->[2]},
                                       $nodeRec->[0])) {
        die "failed to update parent ID for node (".
            join(";",@{$nodeRec})."): ".$dbh->errstr;
    }
    # to avoid gotcha's later in the flow, let's also correct the
    # value in the in-memory record
    $nodeRec->[2] = $ncbiIDmap{$nodeRec->[2]};
    handle_progress($dbh, \$time, ++$n);
}

#
# Because the commit will enforce the deferred foreign key constraint on
# parent, it may actually take a while. Therefore, let's indicate what's
# holding us up.
#
print STDERR "\t... (committing nodes)\n" if $verbose;

end_work($driver,$dbh,1);

#
# if this is PostgreSQL, we need to vacuum analyze here, otherwise the
# following updates will be hideously slow
#
if($driver eq "Pg") {
    print STDERR "\t... (vacuuming)\n" if $verbose;
    $dbh->do("VACUUM ANALYZE taxon");
} elsif($driver eq "SQLite") {
    print STDERR "\t... (vacuuming)\n" if $verbose;
    $dbh->do("VACUUM");
    $dbh->do("ANALYZE taxon");
}

# in case un-constraining it required some special action
constrain_taxon($driver,$dbh);

##### rebuild the nested set left/right values:

print STDERR "\t... rebuilding nested set left/right values\n" if $verbose;

begin_work($driver, $dbh);

$time = time(); # this is for progress timing
handle_subtree($new[0]->[0]); # this relies on @new being ordered by NCBI ID

end_work($driver,$dbh,1);

##### enter the taxonomy names:

print STDERR "\t... reading in taxon names from names.dmp\n" if $verbose;

open(NAMES, "<$dir/names.dmp") or
    die "Couldn't open data file $dir/names.dmp: $!\n";

begin_work($driver,$dbh);

# delete all names for taxon nodes with an NCBI taxonID
print STDERR "\t... deleting old taxon names\n" if $verbose;

delete_ncbi_names($driver, $dbh, $taxontbl, $taxonnametbl,
                  $driver eq "Oracle" ? $tablemap{taxon_name_table} : undef);
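
# Both nodes.dmp (read above) and names.dmp are "|"-delimited dumps.
# Per the NCBI taxdump README, the names.dmp columns are:
#   0: tax_id, 1: name_txt, 2: unique name, 3: name class
# where name classes include "scientific name", "synonym", and
# "common name"; the loop below inserts name_txt and name class only.
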
print STDERR "\t... inserting new taxon names\n" if $verbose;

# go through the names file and insert one row at a time
$n = 0;
$time = time();
while (<NAMES>) {
    my @data = split(/\s*\|\s*/o, $_);
    $sth{add_taxname}->execute($ncbiIDmap{$data[0]}, @data[1, 3]);
    handle_progress($dbh, \$time, ++$n);
}
close(NAMES);

print STDERR "\t... cleaning up\n" if $verbose;

end_work($driver,$dbh,1);

# clean up statement/database handles:
for my $sth (values %sth) {
    my @stmts = ref($sth) eq "ARRAY" ? @$sth : ($sth);
    foreach (@stmts) {
        $_->finish() if ref($_) && $_->{Active};
    }
}

$dbh->disconnect();

print STDERR "Done.\n" if $verbose;

{
    my $nodectr = 0;

    sub handle_subtree {
        my ($id,$left,$right) = @_;

        my $left_value = ++$nodectr;
        $left = -1 unless $left;
        $right = -1 unless $right;

        $sth{get_children}->execute($id);
        for my $child ( @{$sth{get_children}->fetchall_arrayref()} ) {
            handle_subtree(@$child) unless $child->[0] == $id;
        }

        my $right_value = ++$nodectr;

        if(($left != $left_value) || ($right != $right_value)) {
            # if this is an update run, we can't just update to any
            # number we think is right, because another node that we
            # haven't reached yet for update may carry this value
            # (left_value and right_value are constrained for uniqueness)
            if(($driver eq "mysql") && ref($sth{unset_nested_set})) {
                # ugly mysql
                $sth{unset_nested_set}->[0]->execute($left_value);
                $sth{unset_nested_set}->[1]->execute($right_value);
            } else {
                $sth{unset_nested_set}->execute($left_value, $right_value);
            }
            if(!$sth{set_nested_set}->execute($left_value, $right_value,
                                              $id)) {
                die "update of nested set values failed (taxonID: $id): ".
                    $sth{set_nested_set}->errstr;
            }
        }
        handle_progress($dbh, \$time, floor($nodectr/2), undef, $chunksize);
    }
}

sub map_parent_ids {
    my ($old,$new) = @_;

    # we accumulate a list of node records which we will need to
    # update to use the primary key instead
    my @needs_upd = ();

    # we try to minimize the parent updates by mapping NCBI
    # taxonIDs to primary keys for the nodes we already have
    my %ncbiIDmap = map { ($_->[1], $_->[0]); } @$old;

    foreach my $rec (@$new) {
        # if we have it, we know that the primary key isn't going to
        # change, and hence we can map the parent NCBI ID right away
        if (exists($ncbiIDmap{$rec->[2]})) {
            $rec->[2] = $ncbiIDmap{$rec->[2]};
        } else {
            # if we don't have it yet, we don't know yet what the
            # primary key is going to be, and hence we need to update
            # after we have inserted all the nodes
            push (@needs_upd, $rec);
        }
    }
    return \@needs_upd;
}

sub handle_diffs {
    my ($old, $new, $insert, $update, $delete) = @_;

    my ($is, $ds, $us, $na) = (0, 0, 0, 0);

    # we assume $old is already sorted (came from database).
    # we also assume that $old and $new are both arrays of array
    # references, the first elements of which are the unique id's;
    # since both lists are ordered by NCBI taxon ID, a single merge
    # pass classifies each node as an insert, update, delete, or no-op

    # we sort $new by NCBI taxonID:
    @$new = sort { $a->[1] <=> $b->[1] } @$new;

    my $time = time();
    my ($o, $n) = (0, 0);
    my ($odone, $ndone) = (0, 0);
    $odone++ unless @$old;
    $ndone++ unless @$new;

    while ($o < @$old || $n < @$new) {
        handle_progress($dbh, \$time, $n, scalar(@$new));
        if ($odone) {
            # only new's left to add
            if($insert->(@{$new->[$n]})) {
                # propagate new primary key to the in-memory array
                # of new taxon nodes
                $new->[$n]->[0] = last_insert_id($dbh,$taxontbl,$schema);
            } else {
                die "failed to insert node (".join(";",@{$new->[$n]}).
                    "): ".$dbh->errstr;
            }
            $is++; $n++;
        } elsif ($ndone) {
            # only old's left to remove
            if ($nodelete || (!$delete->(@{$old->[$o]}))) {
                print STDERR "note: node (".
                    join(";",map { defined($_) ? $_ : ""; } @{$old->[$o]}).
                    ") is retired" if $verbose || (!$nodelete);
                if (!$nodelete) {
                    # SQL statement failed
                    print STDERR "; failed to delete: ".$dbh->errstr;
                }
                print STDERR "\n" if $verbose || (!$nodelete);
            }
            $ds++; $o++;
        } else {
            # both $o and $n are still valid
            my ($oldentry, $newentry) = ($old->[$o], $new->[$n]);
            if ($oldentry->[1] == $newentry->[1]) {
                # same taxon ID: we may need to update
                # copy primary key:
                $newentry->[0] = $oldentry->[0];
                # and propagate to the in-memory array of new taxon nodes
                $new->[$n]->[0] = $oldentry->[0];
                # make sure entry data are identical, otherwise update:
                my $ok = 1;
              CHECK : for my $i (1 .. @$oldentry-1) {
                    unless ( (defined($oldentry->[$i]) &&
                              defined($newentry->[$i]) &&
                              $oldentry->[$i] eq $newentry->[$i]) ||
                             (!defined($oldentry->[$i]) &&
                              !defined($newentry->[$i])) ) {
                        $ok = 0;
                        last CHECK;
                    }
                }
                unless ($ok) {
                    if(!$update->(@{$newentry})) {
                        die "failed to update node (".
                            join(";",@{$newentry})."): ".$dbh->errstr;
                    }
                    $us++;
                } else {
                    $na++;
                }
                $o++; $n++;
            } elsif ($oldentry->[1] < $newentry->[1]) {
                # old entry to be removed
                if ($nodelete || (!$delete->(@{$oldentry}))) {
                    print STDERR "note: node (".
                        join(";",map { defined($_) ? $_ : ""; }
                             @{$oldentry}).
                        ") is retired" if $verbose || (!$nodelete);
                    if (!$nodelete) {
                        # SQL statement failed
                        print STDERR "; failed to delete: ".$dbh->errstr;
                    }
                    print STDERR "\n" if $verbose || (!$nodelete);
                }
                $ds++; $o++;
            } else {
                # new entry to be added
                if($insert->(@{$newentry})) {
                    # propagate new primary key to the in-memory array
                    # of new taxon nodes
                    $new->[$n]->[0] = last_insert_id($dbh,$taxontbl,$schema);
                } else {
                    die "failed to insert node (".join(";",@{$newentry}).
                        "): ".$dbh->errstr;
                }
                $is++; $n++;
            }
        }
        if ($o == @$old) { $odone++; }
        if ($n == @$new) { $ndone++; }
    }
    return ($is, $us, $ds, $na);
}

sub download_taxondb{
    my $dir = shift;
    my $ua = LWP::UserAgent->new(agent => 'Mozilla/5.0');
    $ua->proxy(['http', 'ftp'] => $ENV{HTTP_PROXY});
    my $req = HTTP::Request->new(
        'GET', "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz");
    # note: request() always returns a response object, so we need to
    # check its status rather than the return value itself
    my $res = $ua->request($req, $dir."/taxdump.tar.gz");
    die "Error downloading file: ".$res->status_line."\n"
        unless $res->is_success;
    # unpack them; overwrite previous files, if necessary
    system("gunzip -f $dir/taxdump.tar.gz");
    system("cd $dir ; tar -xf taxdump.tar ; rm -f taxdump.tar");
}
sub delete_ncbi_names{
    my ($driver,$dbh,$taxontbl,$taxonnametbl,$basetbl) = @_;

    $taxontbl = "taxon" unless $taxontbl;
    $taxonnametbl = "taxon_name" unless $taxonnametbl;
    # for Oracle the name table is a view, and TRUNCATE must target the
    # underlying base table instead (see the tablemap above)
    $basetbl = $taxonnametbl unless $basetbl;

    #
    # We purge all taxon names first that belong to NCBI taxa, followed by
    # inserting the new names from scratch.
    #
    # TRUNCATE table in most RDBMSs is considerably faster than DELETE. If
    # the taxon data hasn't been tampered with or added to, we could get
    # away with a TRUNCATE.
    #
    my $truncsql = "TRUNCATE TABLE ".$basetbl;
    my $delsql = 'DELETE FROM '.$taxonnametbl.' WHERE taxon_id IN ('.
        'SELECT taxon_id FROM '.$taxontbl.' t '.
        'WHERE t.ncbi_taxon_id IS NOT NULL)';

    # Check which delete path we need to (can) take. Note that with Pg
    # our hands are tied.
    my $purgesql;
    if((!$allow_truncate) &&
       (($driver eq "Pg") || ($driver eq "Oracle")) ||
       ($driver eq "SQLite")) {
        $purgesql = $delsql;
    } else {
        my $row = $dbh->selectall_arrayref('SELECT COUNT(*) FROM '.
                                           $taxontbl.
                                           ' WHERE ncbi_taxon_id IS NULL');
        # need the full DELETE query?
        $purgesql = ($row && @$row && $row->[0]->[0]) ? $delsql : $truncsql;
    }

    # If this is the full DELETE query, we'll just blast ahead assuming
    # that the RDBMS can do subqueries. If it blows up, we'll deal with
    # that later.
    my ($rv,$opentransaction);
    for(;;) {
        eval {
            $rv = $dbh->do($purgesql);
        };
        if((!$rv) && ($dbh->errstr =~ /transaction/i)) {
            # this is probably some RDBMS that wants to run truncate
            # outside of a transaction, because it can't roll it back
            end_work($driver, $dbh, 0);
            if(!$allow_truncate) {
                # resort to a full delete; this allows it to run within
                # a transaction
                $purgesql = $delsql;
                begin_work($driver, $dbh);
            } else {
                # indicate that we need to re-open the transaction
                # afterwards
                $opentransaction = 1;
            }
            next;
        }
        last; # exit otherwise, as there is no simple help to make this
              # succeed
    }
    if($@ || (!$rv)) {
        # This must be MySQL still being in the 19th century of RDBMSs.
        # 25 years after everyone else they implemented subqueries - but
        # this version may not be the latest.
        if($driver ne "mysql") {
            die "unexpected failure when trying to delete ".
                "existing taxon names (query was '$purgesql'):\n".
                $dbh->errstr();
        }
        # save the ones we want to keep
        $dbh->do('CREATE TEMPORARY TABLE tname_temp AS '.
                 'SELECT tnm.* '.
                 'FROM '.$taxonnametbl.' tnm, '.$taxontbl.' tn '.
                 'WHERE tnm.taxon_id = tn.taxon_id '.
                 'AND tn.ncbi_taxon_id IS NULL');
        # delete all
        if($driver ne 'SQLite') {
            $dbh->do('TRUNCATE TABLE '.$taxonnametbl);
        } else {
            $dbh->do('DELETE FROM '.$taxonnametbl);
        }
        # restore the saved ones
        $dbh->do('INSERT INTO '.$taxonnametbl.' SELECT * FROM tname_temp');
        # whew! isn't there an easier way?
    } elsif($opentransaction) {
        begin_work($driver,$dbh);
    }
}

sub handle_progress{
    my ($dbh, $time, $n, $total, $commit) = @_;
    our $last_n = 0 if (!defined($last_n)) || ($n < $last_n);
    if($n && ($n - 20000 >= $last_n)) {
        my $elapsed = time() - $$time;
        if($verbose > 1) {
            my $fmt = "\t\t%d";
            $fmt .= "/%d" if $total;
            $fmt .= " done (in %d secs, %4.1f rows/s)\n";
            if($total) {
                printf STDERR $fmt, $n, $total, $elapsed,
                    ($n-$last_n)/($elapsed||1);
            } else {
                printf STDERR $fmt, $n, $elapsed,
                    ($n-$last_n)/($elapsed||1);
            }
        }
        if(defined($commit) && ($commit > 0) &&
           (($n % $commit) <= ($last_n % $commit))) {
            end_work($driver, $dbh, 1);
            if($driver eq "Pg") {
                print STDERR "\t\t(vacuuming)\n" if $verbose;
                $dbh->do("VACUUM ANALYZE taxon");
            }
            begin_work($driver, $dbh);
        }
        $$time = time() if $elapsed;
        $last_n = $n;
    }
}
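
# With --verbose=2, handle_progress() above emits a line like the
# following every 20,000 rows (the numbers here are illustrative):
#
#   20000/2400000 done (in 12 secs, 1666.7 rows/s)
#
# The total is only printed when one was passed in, as during the
# node insert/update/delete pass.
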
sub constrain_taxon{
    my ($driver,$dbh) = @_;

    # The deal is that we get here *after* a commit or rollback, so the
    # transaction in which we deferred or disabled FK checking is already
    # terminated. All we need to take care of here is therefore
    # re-establishing the constraints we removed before in Pg
    if($driver eq "Pg") {
        print STDERR "\t... (re-constraining taxon)\n" if $verbose;
        $dbh->do('SELECT constrain_taxon()');
        # we ignore a possible failure -- maybe we should at least make
        # some noise about it?
    }
}

sub unconstrain_taxon{
    my ($driver,$dbh) = @_;

    # if this is MySQL we need to temporarily disable foreign key
    # constraint checking because MySQL can't defer foreign key
    # validation - ugly (and potentially dangerous as I guess cascading
    # deletes will be disabled while this is in effect)
    if($driver eq "mysql") {
        #if(!$dbh->do('SET FOREIGN_KEY_CHECKS=0')) {
        #    warn "failed to disable foreign key checks: ".$dbh->errstr;
        #}
    }
    # if this is PostgreSQL, it can defer foreign key constraints, but it
    # is for some reason still incredibly slow, with performance degrading
    # rapidly and steeply during the upload. We need to remove the foreign
    # key constraint temporarily. Ugly.
    elsif($driver eq "Pg") {
        if(!$dbh->do('SELECT unconstrain_taxon()')) {
            warn "failed to un-constrain taxon: ".$dbh->errstr;
            end_work($driver,$dbh,0);
            begin_work($driver,$dbh);
            $dbh->do('SET CONSTRAINTS ALL DEFERRED');
        }
    }
    # otherwise let's assume we're fine with just deferring FK constraints
    else {
        # turn on deferrable
        $dbh->do('SET CONSTRAINTS ALL DEFERRED');
    }
}

sub begin_work{
    my ($driver, $dbh, $drop_fk) = @_;

    $dbh->begin_work() if $dbh->{AutoCommit};
    if ($driver eq "mysql") {
        # lock all the tables we'll need, if MySQL:
        # do we really need this?
        #my @locktables = qw(taxon taxon_name);
        #$dbh->do('LOCK TABLES '.
        #         join(", ", map { $_ .= ' WRITE' } @locktables));
    }
}

sub end_work{
    my ($driver, $dbh, $commit) = @_;

    # skip if $driver or $dbh not set up yet
    return unless $driver && $dbh && $dbh->{Active};
    if ($driver eq "mysql") {
        # make sure unsetting this is reverted
        #$dbh->do('SET FOREIGN_KEY_CHECKS=1');
        # unlock all the tables, if MySQL:
        $dbh->do('UNLOCK TABLES');
    }
    # end the transaction
    my $rv = $commit ? $dbh->commit() : $dbh->rollback();
    if(!$rv) {
        print STDERR ($commit ? "commit " : "rollback ").
            "failed: ".$dbh->errstr;
    }
    $dbh->disconnect() unless defined($commit);
}

=head2 last_insert_id

 Title   : last_insert_id
 Usage   : $id = last_insert_id($dbh, $table, $schema)
 Function: Obtain the last primary key that was generated by the
           database in the current session (as represented by the
           database handle).

           This implementation will use a special database handle
           attribute for MySQL and the more recent last_insert_id()
           method in DBI otherwise. Hence, it is not much different
           from, and in fact for all drivers other than MySQL
           identical to, calling the DBI method directly, and may
           hence be removed in the future.
 Returns : A scalar
 Args    : The DBI database handle (mandatory).

           The name of the table into which the record was inserted
           (depends on RDBMS: ignored for MySQL, optional for Pg, for
           example).

           The name of the schema (depends on RDBMS: ignored by
           MySQL, required by Pg only if a schema other than public
           is used).

=cut

sub last_insert_id {
    my ($dbh,$table_name,$schema) = @_;
    my $driver = $dbh->{Driver}->{Name};
    if (lc($driver) eq 'mysql') {
        return $dbh->{'mysql_insertid'};
    } else {
        return $dbh->last_insert_id(undef,$schema,$table_name,undef);
    }
}
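
# Typical usage within this script (see handle_diffs() above), with
# $taxontbl and $schema being the globals set up at the top:
#
#   my $pk = last_insert_id($dbh, $taxontbl, $schema);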