#!/usr/bin/env python """Automatically install required tools and data to run bcbio-nextgen pipelines. This automates the steps required for installation and setup to make it easier to get started with bcbio-nextgen. The defaults provide data files for human variant calling. Requires: git, wget, bgzip2, Python 3 or 2.7 """ from __future__ import print_function import argparse import collections import contextlib import datetime import os import platform import shutil import subprocess import sys try: import urllib2 as urllib_request except ImportError: import urllib.request as urllib_request REMOTES = { "requirements": "https://raw.githubusercontent.com/bcbio/bcbio-nextgen/master/requirements-conda.txt", "gitrepo": "https://github.com/bcbio/bcbio-nextgen.git", "system_config": "https://raw.githubusercontent.com/bcbio/bcbio-nextgen/master/config/bcbio_system.yaml", "anaconda": "https://repo.anaconda.com/miniconda/Miniconda3-py37_4.10.3-%s-x86_64.sh" } def main(args, sys_argv): check_arguments(args) check_dependencies() with bcbio_tmpdir(): setup_data_dir(args) print("Installing isolated base python installation") anaconda = install_anaconda_python(args) if args.use_mamba: conda_bin = "mamba" else: conda_bin = "conda" print(f"Installing {conda_bin}") anaconda = install_mamba(anaconda, args) print("Installing conda-build") subprocess.check_call([anaconda[conda_bin], "install", "--yes", "conda-build", "mamba=0.24.0"]) print("Installing bcbio-nextgen") bcbio = install_conda_pkgs(anaconda, args) bootstrap_bcbionextgen(anaconda, args) print("Installing data and third party dependencies") system_config = write_system_config(REMOTES["system_config"], args.datadir, args.tooldir) setup_manifest(args.datadir) subprocess.check_call([bcbio, "upgrade"] + _clean_args(sys_argv, args)) print("Finished: bcbio-nextgen, tools and data installed") print(" Genome data installed in:\n %s" % args.datadir) if args.tooldir: print(" Tools installed in:\n %s" % args.tooldir) print(" Ready to use system configuration at:\n %s" % system_config) print(" Edit configuration file as needed to match your machine or cluster") def _clean_args(sys_argv, args): """Remove data directory from arguments to pass to upgrade function remove --mamba""" base = [x for x in sys_argv if x.startswith("-") or not args.datadir == os.path.abspath(os.path.expanduser(x))] # Remove installer only options we don't pass on base = [x for x in base if x not in set(["--minimize-disk"])] if "--nodata" in base: base.remove("--nodata") else: base.append("--data") if "--mamba" in base: base.remove("--mamba") return base def bootstrap_bcbionextgen(anaconda, args): if args.upgrade == "development": git_tag = "@%s" % args.revision if args.revision != "master" else "" subprocess.check_call([anaconda["pip"], "install", "--upgrade", "--no-deps", "git+%s%s#egg=bcbio-nextgen" % (REMOTES["gitrepo"], git_tag)]) def install_mamba(anaconda, args): """ Install conda or mamba""" if args.use_mamba: conda_bin = "mamba" else: conda_bin = "conda" anaconda_dir = os.path.join(args.datadir, "anaconda") bindir = os.path.join(anaconda_dir, "bin") mamba = os.path.join(bindir, conda_bin) subprocess.check_call([anaconda["conda"], "install", "--yes", conda_bin]) anaconda[conda_bin] = mamba return anaconda def install_conda_pkgs(anaconda, args): env = dict(os.environ) # Try to avoid user specific pkgs and envs directories # https://github.com/conda/conda/issues/6748 env["CONDA_PKGS_DIRS"] = os.path.join(anaconda["dir"], "pkgs") env["CONDA_ENVS_DIRS"] = os.path.join(anaconda["dir"], "envs") conda_bin = anaconda["conda"] if "mamba" in anaconda.keys(): mamba_bin = anaconda["mamba"] else: mamba_bin = anaconda["conda"] if not os.path.exists(os.path.basename(REMOTES["requirements"])): subprocess.check_call(["wget", "--no-check-certificate", REMOTES["requirements"]]) if args.minimize_disk: subprocess.check_call([mamba_bin, "install", "--yes", "nomkl"], env=env) subprocess.check_call([mamba_bin, "install", "--yes", "--only-deps", "bcbio-nextgen"], env=env) subprocess.check_call([conda_bin, "install", "--yes", "--file", os.path.basename(REMOTES["requirements"])], env=env) return os.path.join(anaconda["dir"], "bin", "bcbio_nextgen.py") def _guess_distribution(): """Simple approach to identify if we are on a MacOSX or Linux system for Anaconda""" if platform.mac_ver()[0]: return "macosx" else: return "linux" def install_anaconda_python(args): """Provide isolated installation of Anaconda python for running bcbio-nextgen. http://docs.continuum.io/anaconda/index.html """ anaconda_dir = os.path.join(args.datadir, "anaconda") bindir = os.path.join(anaconda_dir, "bin") # insert anaconda/bin the get python and conda from there sys.path.insert(0, bindir) conda = os.path.join(bindir, "conda") if not os.path.exists(anaconda_dir) or not os.path.exists(conda): if os.path.exists(anaconda_dir): shutil.rmtree(anaconda_dir) dist = args.distribution if args.distribution else _guess_distribution() url = REMOTES["anaconda"] % ("MacOSX" if dist.lower() == "macosx" else "Linux") if not os.path.exists(os.path.basename(url)): subprocess.check_call(['wget', '--progress=dot:giga', url]) subprocess.check_call(['bash', os.path.basename(url), '-b', '-p', anaconda_dir]) # conda-forge channel should have the highest priority # https://bioconda.github.io/user/install.html#set-up-channels subprocess.check_call([conda, 'config', '--add', 'channels', 'bioconda', '--file', os.path.join(anaconda_dir, '.condarc')]) subprocess.check_call([conda, 'config', '--add', 'channels', 'conda-forge', '--file', os.path.join(anaconda_dir, '.condarc')]) return {"conda": conda, "pip": os.path.join(bindir, "pip"), "dir": anaconda_dir} def setup_manifest(datadir): """Create barebones manifest to be filled in during update""" manifest_dir = os.path.join(datadir, "manifest") if not os.path.exists(manifest_dir): os.makedirs(manifest_dir) def write_system_config(base_url, datadir, tooldir): """Write a bcbio_system.yaml configuration file with tool information""" out_file = os.path.join(datadir, "galaxy", os.path.basename(base_url)) if not os.path.exists(os.path.dirname(out_file)): os.makedirs(os.path.dirname(out_file)) if os.path.exists(out_file): # if no tool directory and exists, do not overwrite if tooldir is None: return out_file else: bak_file = out_file + ".bak%s" % (datetime.datetime.now().strftime("%Y%M%d_%H%M")) shutil.copy(out_file, bak_file) if tooldir: java_basedir = os.path.join(tooldir, "share", "java") rewrite_ignore = ("log",) with contextlib.closing(urllib_request.urlopen(base_url)) as in_handle: with open(out_file, "w") as out_handle: in_resources = False in_prog = None for line in (l.decode("utf-8") for l in in_handle): if line[0] != " ": in_resources = line.startswith("resources") in_prog = None elif (in_resources and line[:2] == " " and line[2] != " " and not line.strip().startswith(rewrite_ignore)): in_prog = line.split(":")[0].strip() # Update java directories to point to install directory, avoid special cases elif line.strip().startswith("dir:") and in_prog and in_prog not in ["log", "tmp"]: final_dir = os.path.basename(line.split()[-1]) if tooldir: line = "%s: %s\n" % (line.split(":")[0], os.path.join(java_basedir, final_dir)) in_prog = None elif line.startswith("galaxy"): line = "# %s" % line out_handle.write(line) return out_file def setup_data_dir(args): if not os.path.exists(args.datadir): cmd = ["mkdir", "-p", args.datadir] subprocess.check_call(cmd) @contextlib.contextmanager def bcbio_tmpdir(): orig_dir = os.getcwd() work_dir = os.path.join(os.getcwd(), "tmpbcbio-install") if not os.path.exists(work_dir): os.makedirs(work_dir) os.chdir(work_dir) yield work_dir os.chdir(orig_dir) shutil.rmtree(work_dir) def check_arguments(args): """Ensure argruments are consistent and correct""" if args.toolplus and not args.tooldir: raise argparse.ArgumentTypeError("Cannot specify --toolplus without --tooldir") def check_dependencies(): """Ensure required tools for installation are present""" print("Checking required dependencies") for dep, msg in [(["git", "--version"], "Git (http://git-scm.com/)"), (["wget", "--version"], "wget"), (["bzip2", "-h"], "bzip2")]: try: p = subprocess.Popen(dep, stderr=subprocess.STDOUT, stdout=subprocess.PIPE) out, code = p.communicate() except OSError: out = "Executable not found" code = 127 if code == 127: raise OSError("bcbio-nextgen installer requires %s\n%s" % (msg, out)) def _check_toolplus(x): """Parse options for adding non-standard/commercial tools like GATK and MuTecT""" import argparse Tool = collections.namedtuple("Tool", ["name", "fname"]) std_choices = set(["data", "dbnsfp", "ericscript"]) if x in std_choices: return Tool(x, None) elif "=" in x and len(x.split("=")) == 2: name, fname = x.split("=") fname = os.path.normpath(os.path.realpath(fname)) if not os.path.exists(fname): raise argparse.ArgumentTypeError("Unexpected --toolplus argument for %s. " "File does not exist: %s" % (name, fname)) return Tool(name, fname) else: raise argparse.ArgumentTypeError("Unexpected --toolplus argument. " "Expect toolname=filename.") if __name__ == "__main__": parser = argparse.ArgumentParser( description="Automatic installation for bcbio-nextgen pipelines") parser.add_argument("datadir", help="Directory to install genome data", type=lambda x: (os.path.abspath(os.path.expanduser(x)))) parser.add_argument("--cores", default=1, help="Number of cores to use if local indexing is necessary.") parser.add_argument("--tooldir", help="Directory to install 3rd party software tools. " "Leave unspecified for no tools", type=lambda x: (os.path.abspath(os.path.expanduser(x))), default=None) parser.add_argument("--toolplus", help="Specify additional tool categories to install", action="append", default=[], type=_check_toolplus) parser.add_argument("--datatarget", help="Data to install. Allows customization or install of extra data.", action="append", default=[], choices=["variation", "rnaseq", "smallrna", "gemini", "vep", "dbnsfp", "battenberg", "kraken", "ericscript", "gnomad"]) parser.add_argument("--genomes", help="Genomes to download", action="append", default=[], choices=["BDGP6", "canFam3", "dm3", "galGal4", "GRCh37", "GRCz10", "GRCz11", "hg19", "hg38", "hg38-noalt", "mm10", "mm9", "phix", "pseudomonas_aeruginosa_ucbpp_pa14", "rn5", "rn6", "sacCer3", "Sscrofa11.1", "TAIR10", "WBcel235", "xenTro3"]) parser.add_argument("--aligners", help="Aligner indexes to download", action="append", default=[], choices=["bbmap", "bowtie", "bowtie2", "bwa", "hisat2", "minimap2", "novoalign", "rtg", "snap", "star", "ucsc"]) parser.add_argument("--nodata", help="Do not install data dependencies", dest="install_data", action="store_false", default=True) parser.add_argument("--mamba", help="Use mamba instead of conda", dest="use_mamba", action="store_true", default=False) parser.add_argument("--isolate", help="Created an isolated installation without PATH updates", dest="isolate", action="store_true", default=False) parser.add_argument("--minimize-disk", help="Try to minimize disk usage (no MKL extensions)", dest="minimize_disk", action="store_true", default=False) parser.add_argument("-u", "--upgrade", help="Code version to install", choices=["stable", "development"], default="stable") parser.add_argument("--revision", help="Specify a git commit hash or tag to install", default="master") parser.add_argument("--cloudbiolinux", help="Specify a cloudbiolinux git commit hash or tag to install", default="master") parser.add_argument("--distribution", help="Operating system distribution", default="", choices=["ubuntu", "debian", "centos", "scientificlinux", "macosx"]) if len(sys.argv) == 1: parser.print_help() else: main(parser.parse_args(), sys.argv[1:])