diff --git a/conda/build.sh b/conda/build.sh index adfbe77..9f1ccb5 100755 --- a/conda/build.sh +++ b/conda/build.sh @@ -24,6 +24,7 @@ chmod +x $PREFIX/bin/bdm chmod +x $PREFIX/external/bin/linux/* chmod +x $PREFIX/external/bin/macos/* chmod +x $PREFIX/conf/scripts/*.sh +chmod +x $PREFIX/conf/scripts/scheduler/*.sh # Discard Windows stuff cd $PREFIX/external/bin && rm -rf windows diff --git a/conda/meta.yaml b/conda/meta.yaml index 6923e10..f7dc97f 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,14 +1,13 @@ {% set name = "beedeem" %} -{% set version = "4.7.5" %} -{% set sha256 = "94299b66f232288e69050cf7e475ea3acaecf07fa0b3eb36f073a3b86509183d" %} +{% set version = "5.0.0" %} +{% set sha256 = "395b8e56919e2f20715bb636d3ccaa1a4c98302547836c768b3189f131902b58" %} package: name: {{ name }} version: {{ version }} source: -# path: ../distrib/beedeem-{{ version }}.tar.gz - url: https://github.com/pgdurand/BeeDeeM/releases/download/v{{ version }}/beedeem-{{ version }}-distrib.zip + path: ../distrib/beedeem-{{ version }}.tar.gz sha256: '{{ sha256 }}' build: diff --git a/conda/run_test.sh b/conda/run_test.sh index ae538b2..d6ae21f 100755 --- a/conda/run_test.sh +++ b/conda/run_test.sh @@ -10,15 +10,55 @@ TMP_DIR=$(mktemp bdm.XXXXXXXXXX) # to override default BeeDeeM configuration export KL_WORKING_DIR=$SCRATCH/$TMP_DIR export KL_mirror__path=$KL_WORKING_DIR +export KL_LOG_TYPE=console echo "BeeDeeM Conda test within: $KL_WORKING_DIR" mkdir -p $KL_WORKING_DIR + +# == TEST 1 =========================================== +echo "*** TEST 1: Start simple bank installation" # These are two ".dsc" files located in BeeDeem-home/conf/descriptors # path DESC_LIST="PDB_proteins,SwissProt_human" # Start installation bdm install -desc $DESC_LIST +if [ $? -eq 0 ]; then + echo "TEST 1: SUCCESS" +else + echo "TEST 1: FAILED." 
+ exit 1 +fi + +# == TEST 2 =========================================== +echo "*** TEST 2: list installed bank" +export KL_LOG_TYPE=none +bdm info -d all -f txt + +if [ $? -eq 0 ]; then + echo "TEST 2: SUCCESS" +else + echo "TEST 2: FAILED. Review log file in: $BDM_WORK_DIR" + exit 1 +fi + + +# == TEST 3 =========================================== +# Change default BeeDeeM log file name to something else +export KL_LOG_TYPE=file +export KL_LOG_FILE=query.log + +SW_ENTRY="ZZZ3_HUMAN" +echo "*** TEST 3: query bank for entry: $SW_ENTRY" +bdm query -d p -f txt -i $SW_ENTRY + +if [ $? -eq 0 ]; then + echo "TEST 3: SUCCESS" +else + echo "TEST 3: FAILED. Review log file: $BDM_WORK_DIR" + exit 1 +fi + # Note: by design of this script, we DO NOT delete # $KL_WORKING_DIR... please, do it yourself. diff --git a/conf/dbms.config b/conf/dbms.config index 2d23433..2086dc8 100755 --- a/conf/dbms.config +++ b/conf/dbms.config @@ -5,25 +5,40 @@ # This file is loaded during BeeDeeM startup to define some # usefull resources. # -# It is possible to override this default behaviour using -# environment variables, as follows. To redefine one of the +# It is possible to override this default behavior using either +# -D JRE cmdline arguments or environment variables, as follows. +# +# Using -D JRE args: prefix config variable name with KL_; +# example: +# -DKL_copy.workers=4 +# will override value defined in this configuration file by +# copy.workers (see below, line 57). +# +# Using shell variable. To redefine one of the # below listed variables (e.g. mirror.path), set a shell # variable called KL_ where you replace dot -# character by a double underscore (e.g. KL_mirror__path). +# character by a double underscore (e.g. KL_mirror__path); as a +# reminder, dot char is not allowed in shell variable names. 
# # Example: to force BeeDeeM using a mirror.path different than # the one set below, simply do this (bash shell) BEFORE starting # BeeDeeM: -# export KL_mirror__path=/a/new/path +# export KL_mirror__path=/a/new/path +# java .../... +# or directly: +# java .../... -DKL_mirror.path=/another/path +# +# Declaration priority is as follows: +# env var > -D > config file # # ================================================================== # Path where to install/manage local copies of databanks -mirror.path = /tmp/biobanks +mirror.path = ${HOME}/beedeem-banks # Path where to prepare local installation of databanks # before copying them in mirror.path during install in production task -#mirrorprepa.path = /tmp/biobanks/tmp +#mirrorprepa.path = ${HOME}/beedeem-banks/tmp # File storing the list of available databanks mirror.file = dbmirror.config @@ -66,8 +81,7 @@ lucene.fs = default lucene.lock = default # Following keys are for Aspera configuration -# Windows: do not add .exe extension to ascp binary # All OS: DO NOT use space in directory names !!! -aspera.bin.path=/Users/pgdurand/Applications/Aspera-CLI/bin/ascp -aspera.key.path=/Users/pgdurand/Applications/Aspera-CLI/etc/asperaweb_id_dsa.openssh +aspera.bin.path=${HOME}/Applications/Aspera-CLI/bin/ascp +aspera.key.path=${HOME}/Applications/Aspera-CLI/etc/asperaweb_id_dsa.openssh diff --git a/conf/descriptors/PDB_proteins_task.dsc b/conf/descriptors/PDB_proteins_task.dsc index eb1ed2a..697b135 100644 --- a/conf/descriptors/PDB_proteins_task.dsc +++ b/conf/descriptors/PDB_proteins_task.dsc @@ -1,9 +1,9 @@ -#PDB_proteins -#Fri Sep 15 13:47:28 CEST 2017 - # Illustrates the use of various scripts (external tasks) # DO NOT use it for production! 
+# Descriptor documentation, see: +# https://pgdurand.gitbook.io/beedeem/getting-started/descriptors-format + db.name=PDB_proteins_task db.desc=PDB Protein databank; illustrate use of external script call db.type=p @@ -12,7 +12,11 @@ db.ldir=${mirrordir}|p|PDB_proteins_task db.files.include=pdbaa.tar.gz db.files.exclude= -tasks.global.pre=script(name=WaitALittle;path=wait_a_little),script(name=HelloWorld;path=hello_world) +# This is a pre-processing script; called once at the very beginning of bank processing +# Illustrate a script call with additional arguments; setting an argument with '=NA' defines a no-arg parameter +# Pre-processing script is optional. +tasks.global.pre=script(name=WaitALittle;path=wait_a_little),script(name=HelloWorld;path=hello_world;-parse_seqid=NA;-k=19;-w=15;--verbose-mode=debug) + tasks.global.post=script(name=WaitALittle;path=wait_a_little),makealias,delgz,deltar,script(name=HelloWorld;path=hello_world) tasks.unit.post=script(name=WaitALittle;path=wait_a_little),gunzip,untar,script(name=HelloWorld;path=hello_world) diff --git a/conf/scripts/hello_world.sh b/conf/scripts/hello_world.sh index b2d5470..fe24e5c 100755 --- a/conf/scripts/hello_world.sh +++ b/conf/scripts/hello_world.sh @@ -1,55 +1,43 @@ #!/usr/bin/env bash -# This is a BeeDeeM external task script template. +# This is a BeeDeeM external task script example. # # This script illustrates the use of external tasks. # Such a task is called from a bank descriptor; e.g. see # for instance ../descriptors/PDB_proteins_task.dsc # -# Such a BeeDeeM script is called by the task engine and -# with these arguments: -w -d -f -n -t -# -# -w : is the working directory path. -# provided for both unit and global tasks. -# -d : is the bank installation path. -# provided for both unit and global tasks. -# -f : is the path to file under unit task processing -# only provided with unit task. -# -n : is the bank name. -# -t : is the bank type. One of p, n or d. 
-# p: protein -# n: nucleotide -# d: dictionary or ontology +# Such a BeeDeeM script is called by the task engine with some +# BeeDeeM specific arguments, see +# conf/scripts/scheduler/common.sh#handleBDMArgs() +# for more information. -echo "Executing an external script" -echo "Arguments coming from BeeDeeM are:" -echo $@ +# If you setup a new script, simply copy this one, keep in it +# lines 1 and 20-31 (sections "include API" and "handle arguments"), +# then do whatever you have to do! -echo "----" -# Prepare arguments for processing -WK_DIR= -INST_DIR= -PROCESSED_FILE= -BANK_NAME= -BANK_TYPE= -while getopts w:d:f:n:t: opt -do - case "$opt" in - w) WK_DIR="$OPTARG";; - d) INST_DIR="$OPTARG";; - f) PROCESSED_FILE="$OPTARG";; - n) BANK_NAME="$OPTARG";; - t) BANK_TYPE="$OPTARG";; - esac -done -shift `expr $OPTIND - 1` -# remaining arguments, if any, are stored here -MORE_ARGS=$@ +set -eo pipefail + +# ======================================================================================== +# Section: include API +S_NAME=$(realpath "$0") +[[ -z "$BDM_CONF_SCRIPTS" ]] && script_dir=$(dirname "$S_NAME") || script_dir=$BDM_CONF_SCRIPTS +. $script_dir/scheduler/common.sh + +# ======================================================================================== +# Section: handle arguments +# Function call setting BDMC_xxx variables from cmdline arguments +handleBDMArgs $@ +RET_CODE=$? +[ ! 
$RET_CODE -eq 0 ] && errorMsg "Wrong or missing arguments" && exit $RET_CODE + +# ======================================================================================== +# Section: do business -echo "Working dir: $WK_DIR" -echo "Install dir: $INST_DIR" -echo "Processed file: $PROCESSED_FILE" -echo "Bank name: $BANK_NAME" -echo "Bank type: $BANK_TYPE" +echo "Working directory of BeeDeeM: $BDMC_WK_DIR" +echo "Bank installation path: $BDMC_INST_DIR" +echo "Current bank file processed: $BDMC_PROCESSED_FILE" +echo "Bank name: $BDMC_BANK_NAME" +echo "Bank type: $BDMC_BANK_TYPE" +echo "Additional args: $BDMC_MORE_ARGS" echo "----" diff --git a/conf/scripts/scheduler/common.sh b/conf/scripts/scheduler/common.sh index 9bdc384..cb1a49f 100755 --- a/conf/scripts/scheduler/common.sh +++ b/conf/scripts/scheduler/common.sh @@ -102,7 +102,7 @@ BDMC_PLATFORM=$BDM_PLATFORM # # Such a BeeDeeM script is always called # with these arguments: -# -w -d -f -n -t +# -w -d -f -n -t -o # # -w : is the working directory path. # provided for both unit and global tasks. @@ -116,10 +116,13 @@ BDMC_PLATFORM=$BDM_PLATFORM # n: nucleotide # d: dictionary or ontology # -p : platform name (i.e. a specific cluster configuration) +# -o : optional. Only set if script directive passes in some +# additional arguments. +# See conf/descriptors/PDB_proteins_task.dsc for an example. 
function handleBDMArgs(){ infoMsg "Arguments coming from BeeDeeM are: [$@]" local OPTIND - while getopts w:d:f:n:t:p: opt + while getopts w:d:f:n:t:p:o: opt do case "$opt" in w) BDMC_WK_DIR="$OPTARG";; @@ -128,18 +131,22 @@ function handleBDMArgs(){ n) BDMC_BANK_NAME="$OPTARG";; t) BDMC_BANK_TYPE="$OPTARG";; p) BDMC_PLATFORM="$OPTARG";; + o) BDMC_MORE_ARGS="$OPTARG";; esac done shift `expr $OPTIND - 1` - BDMC_MORE_ARGS=$@ + + # Optional arguments consists in a special encoded string made by + # bzh.plealog.dbmirror.task.PTaskExecScript code + BDMC_MORE_ARGS=$(echo $BDMC_MORE_ARGS | sed -e 's/;/ /g') - infoMsg "Working dir: $BDMC_WK_DIR" - infoMsg "Install dir: $BDMC_INST_DIR" - infoMsg "Processed file: $BDMC_PROCESSED_FILE" + infoMsg "Working directory of BeeDeeM: $BDMC_WK_DIR" + infoMsg "Bank installa path: $BDMC_INST_DIR" + infoMsg "Current processed bank file: $BDMC_PROCESSED_FILE" infoMsg "Bank name: $BDMC_BANK_NAME" infoMsg "Bank type: $BDMC_BANK_TYPE" infoMsg "Platform: $BDMC_PLATFORM" - infoMsg "Remaining args: $BDMC_MORE_ARGS" + infoMsg "Remaining task script arguments: $BDMC_MORE_ARGS" } # -------- @@ -193,6 +200,10 @@ function getResources(){ # FUNCTION: figure out which Job Scheduler is available on host system # return: 0 if job scheduler found, 1 otherwise. Job scheduler name is echoed. function getScheduler(){ + if [ ! 
-z "$BDM_SCHEDULER" ]; then + echo $BDM_SCHEDULER + return 0 + fi local ret_value= if hasCommand qstat; then ret_value=$(qstat --version) @@ -206,3 +217,4 @@ function getScheduler(){ return 1 } +handleBDMArgs $@ diff --git a/conf/system/dbmsVersion-txt.vm b/conf/system/dbmsVersion-txt.vm index a2db135..7e9a299 100644 --- a/conf/system/dbmsVersion-txt.vm +++ b/conf/system/dbmsVersion-txt.vm @@ -17,8 +17,15 @@ Installed banks #foreach( $db in $pTable ) * $db.getName() Description: $db.getDescription() + Home dir: $db.getDbHome() BLAST+ use: -db $db.getDbPath().substring(0, $db.getDbPath().lastIndexOf(".")) Annotated bank: $db.hasAnnotation() +#if( ! ${db.getAdditionalIndex().isEmpty()} ) + Additional index: +#foreach( $key in $db.getAdditionalIndex().keySet() ) + $key : $db.getAdditionalIndex().get($key).toString() +#end +#end Size (sequences): $db.getNbSequence() Size on disk: $db.getDiskSize() Release date: $db.getReleaseTimeStamp() @@ -32,8 +39,15 @@ Installed banks #foreach( $db in $nTable ) * $db.getName() Description: $db.getDescription() + Home dir: $db.getDbHome() BLAST+ use: -db $db.getDbPath().substring(0, $db.getDbPath().lastIndexOf(".")) Annotated bank: $db.hasAnnotation() +#if( ! 
${db.getAdditionalIndex().isEmpty()} ) + Additional index: +#foreach( $key in $db.getAdditionalIndex().keySet() ) + $key : $db.getAdditionalIndex().get($key).toString() +#end +#end Size (sequences): $db.getNbSequence() Size on disk: $db.getDiskSize() Release date: $db.getReleaseTimeStamp() @@ -46,6 +60,7 @@ Installed banks #if( ${dTable.size()} != 0 ) #foreach( $db in $dTable ) * $db.getName() + Home dir: $db.getDbHome() Description: $db.getDescription() Size (terms): $db.getNbSequence() Size on disk: $db.getDiskSize() diff --git a/docker/Dockerfile b/docker/Dockerfile index 4c2f7f1..56e29e8 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,36 +1,54 @@ ####################################################################################### # # Dockerfile to use BeeDeem through a Docker container. +# Package BeeDeeM and BeeDeeM-Tools by getting releases available on Github. # -# Copyright (c) 2017-21, Patrick G. Durand +# Copyright (c) 2017-23, Patrick G. Durand # ####################################################################################### # ### # Base commands. # -# We use Alpine-Oracle JRE 8 pre-build Docker Image. -# See https://github.com/platten/alpine-oracle-jre8-docker +# We use standard Ubuntu 18 Linux. # -FROM platten/alpine-oracle-jre8-docker +FROM ubuntu:18.04 # Maintainer of BeeDeeM MAINTAINER Patrick G. Durand +# ### +# Configuring release of tools to package. +# +ENV BDM_VERSION=5.0.0 +ENV BDMT_VERSION=2.1.1 + # ### # Install dependencies. +# software-properties-common required to install Java Runtime # libbz2 libidn are required by blast tools included in BeeDeeM. # libxext libxrender libxtst libxi are required by Bioinformatics-Core-API included # in BeeDeeM and BeeDeeM-Tools. # bash is required when running this image by Nextflow pipelines. # wget is required to install some banks (e.g. GeneOntology). 
-RUN apk update -RUN apk add --no-cache libbz2 libidn bash wget libxext libxrender libxtst libxi +# unzip is required to deploy BeeDeeM archive +# openssh-server is required to submit jobs on cluster through ssh connection +RUN apt-get -y update +RUN apt-get -y install software-properties-common bzip2 libidn11 bash wget libxext6 libxrender1 libxtst6 libxi6 unzip +RUN apt-get -y install openssh-server # ### -# Configuring BeeDeeM release. Always the latest one. -# -ENV BDM_VERSION=4.7.4 +# Java JRE 1.8 +RUN add-apt-repository ppa:ts.sch.gr/ppa \ + && echo debconf shared/accepted-oracle-license-v1-1 select true | debconf-set-selections \ + && echo debconf shared/accepted-oracle-license-v1-1 seen true | debconf-set-selections \ + && apt-get update \ + && apt-get -y install oracle-java8-installer \ + && apt-get -y install oracle-java8-set-default \ + && apt-get clean all \ + && rm -rf /var/lib/apt/lists/* \ + && cd /usr/lib/jvm/java-8-oracle\ + && rm -rf bin include lib man *.zip # ### # BeeDeeM download, installation and cleaning. @@ -47,10 +65,10 @@ RUN \ RUN \ cd /opt/beedeem && \ - cp scripts/bdm.sh ./bdm && \ + cp scripts/*.sh . && \ cp scripts/dbms.config conf && \ sed -i 's/@BIOBASE_ROOTDIR@/\/beedeem-db/g' conf/dbms.config && \ - chmod +x bdm && \ + chmod +x *.sh && \ chmod +x /opt/beedeem/external/bin/linux/* && \ chmod +x /opt/beedeem/conf/scripts/*.sh && \ cd /opt/beedeem/external/bin && rm -rf macos windows @@ -59,7 +77,6 @@ RUN \ # ### # Add BeeDeeM-tools. Always the latest one. # -ENV BDMT_VERSION=2.1.0 RUN \ cd /opt && \ mkdir beedeem-tools && \ @@ -74,10 +91,4 @@ RUN \ # BeeDeeM runtime environment variables. # ENV PATH=/opt/beedeem:/opt/beedeem-tools:$PATH -ENV KL_WORKING_DIR=/beedeem-wk -# shell variable name does not allowed use of '.' -# (mirror__path == mirror.path) -ENV KL_mirror__path=/beedeem-db -# You can override the following at runtime by using Docker "-e" argument. 
-ENV KL_JRE_ARGS="-Xms128M -Xmx2048M -Djava.io.tmpdir=\$KL_WORKING_DIR" diff --git a/docker/Dockerfile.jre8 b/docker/Dockerfile.jre8 new file mode 100644 index 0000000..381485a --- /dev/null +++ b/docker/Dockerfile.jre8 @@ -0,0 +1,34 @@ +####################################################################################### +# +# Dockerfile to use BeeDeem through a Docker container. +# Create Docker image from locally built BeeDeeM and BeeDeeM-Tools. +# +# Copyright (c) 2017-23, Patrick G. Durand +# +####################################################################################### + +# ### +# Base commands. +# +# We use standard Ubuntu 18 Linux. +# +FROM ubuntu:18.04 + +# Maintainer of BeeDeeM +MAINTAINER Patrick G. Durand + +RUN apt-get -y update +RUN apt-get -y install ant software-properties-common bzip2 libidn11 bash wget libxext6 libxrender1 libxtst6 libxi6 unzip + +# ### +# Java JRE 1.8 +RUN add-apt-repository ppa:ts.sch.gr/ppa \ + && echo debconf shared/accepted-oracle-license-v1-1 select true | debconf-set-selections \ + && echo debconf shared/accepted-oracle-license-v1-1 seen true | debconf-set-selections \ + && apt-get update \ + && apt-get -y install oracle-java8-installer \ + && apt-get -y install oracle-java8-set-default \ + && apt-get clean all \ + && rm -rf /var/lib/apt/lists/* \ + && cd /usr/lib/jvm/java-8-oracle + diff --git a/docker/Dockerfile.local b/docker/Dockerfile.local new file mode 100644 index 0000000..93e9157 --- /dev/null +++ b/docker/Dockerfile.local @@ -0,0 +1,94 @@ +####################################################################################### +# +# Dockerfile to use BeeDeem through a Docker container. +# Create Docker image from locally built BeeDeeM and BeeDeeM-Tools. +# +# Copyright (c) 2017-23, Patrick G. Durand +# +####################################################################################### + +# ### +# Base commands. +# +# We use standard Ubuntu 18 Linux. 
+# +FROM ubuntu:18.04 + +# Maintainer of BeeDeeM +MAINTAINER Patrick G. Durand + +# ### +# Configuring release of tools to package. +# +ENV BDM_VERSION=5.0.0 +ENV BDMT_VERSION=2.1.1 + +# ### +# Install dependencies. +# software-properties-common required to install Java Runtime +# libbz2 libidn are required by blast tools included in BeeDeeM. +# libxext libxrender libxtst libxi are required by Bioinformatics-Core-API included +# in BeeDeeM and BeeDeeM-Tools. +# bash is required when running this image by Nextflow pipelines. +# wget is required to install some banks (e.g. GeneOntology). +# unzip is required to deploy BeeDeeM archive +# openssh-server is required to submit jobs on cluster through ssh connection +RUN apt-get -y update +RUN apt-get -y install software-properties-common bzip2 libidn11 bash wget libxext6 libxrender1 libxtst6 libxi6 unzip +RUN apt-get -y install openssh-server + +# ### +# Java JRE 1.8 +RUN add-apt-repository ppa:ts.sch.gr/ppa \ + && echo debconf shared/accepted-oracle-license-v1-1 select true | debconf-set-selections \ + && echo debconf shared/accepted-oracle-license-v1-1 seen true | debconf-set-selections \ + && apt-get update \ + && apt-get -y install oracle-java8-installer \ + && apt-get -y install oracle-java8-set-default \ + && apt-get clean all \ + && rm -rf /var/lib/apt/lists/* \ + && cd /usr/lib/jvm/java-8-oracle\ + && rm -rf bin include lib man *.zip + +# ### +# BeeDeeM. Get pre-built local one. +# +COPY beedeem-${BDM_VERSION}-distrib.zip /opt +RUN \ + mkdir -p /opt/beedeem/tmp-install && \ + mv /opt/beedeem-${BDM_VERSION}-distrib.zip /opt/beedeem/tmp-install && \ + cd /opt/beedeem/tmp-install && \ + unzip beedeem-${BDM_VERSION}-distrib.zip && \ + tar -zxf beedeem-${BDM_VERSION}.tar.gz -C /opt/beedeem && \ + cd .. 
&& \ + rm -rf tmp-install && \ + cp scripts/bdm.sh ./bdm && \ + sed -i 's/@KL_WORKING_DIR@/\/beedeem-wk/g' bdm && \ + sed -i 's/@JAVA_ARGS@/-Xms128M -Xmx2048M -Djava.io.tmpdir=\$KL_WORKING_DIR -DKL_LOG_TYPE=console/g' bdm && \ + cp scripts/dbms.config conf && \ + sed -i 's/@BIOBASE_ROOTDIR@/\/beedeem-db/g' conf/dbms.config && \ + chmod +x bdm && \ + chmod +x /opt/beedeem/external/bin/linux/* && \ + chmod +x /opt/beedeem/conf/scripts/*.sh && \ + chmod +x /opt/beedeem/conf/scripts/scheduler/*.sh && \ + cd /opt/beedeem/external/bin && rm -rf macos windows + +# ### +# Add BeeDeeM-tools. Use pre-built local one. +# +COPY beedeem-tools-${BDMT_VERSION}.tar.gz /opt +RUN \ + cd /opt && \ + mkdir beedeem-tools && \ + cd beedeem-tools && \ + mv ../beedeem-tools-${BDMT_VERSION}.tar.gz . && \ + gunzip beedeem-tools-${BDMT_VERSION}.tar.gz && \ + tar -xf beedeem-tools-${BDMT_VERSION}.tar && \ + rm beedeem-tools-${BDMT_VERSION}.tar && \ + chmod +x *.sh + +# ### +# BeeDeeM runtime environment variables. +# +ENV PATH=/opt/beedeem:/opt/beedeem-tools:$PATH + diff --git a/docker/README.md b/docker/README.md index d6d9b53..fac8c00 100644 --- a/docker/README.md +++ b/docker/README.md @@ -2,13 +2,23 @@ This document explains how you can setup and use *BeeDeeM* within a Docker container. +It is worth noting that this BeeDeeM image actually contains both BeeDeeM and [BeeDeeM-Tools](https://gitlab.ifremer.fr/bioinfo/BeeDeeM-Tools). + ## Requirements Of course, you need to have [Docker](https://docs.docker.com/engine/installation/) installed on your system. We also suppose that you are familiar with [docker build](https://docs.docker.com/engine/reference/commandline/build/) and [docker run](https://docs.docker.com/engine/reference/commandline/run/) commands. -Note: this BeeDeeM's *Dockerfile* was made and tested using *Docker version 20* on *macOS Catalina*. +Note: this BeeDeeM's *Dockerfile* was made and tested using *Docker engine release 20* on *macOS Monterey*. 
+ +## Get a pre-built image + +Simply use: + +```docker pull sebimer/beedeem:5.0.0``` + +to get the combined image of BeeDeeM and [BeeDeeM-Tools](https://gitlab.ifremer.fr/bioinfo/BeeDeeM-Tools). ## Build the container @@ -20,41 +30,48 @@ Use this command: ## Run the container + KL_mirror__path=/path/to/bank_repository <-- (1) + KL_WORKING_DIR=/path/to/work-directory <-- (2) + KL_JRE_ARGS="-Xms128M -Xmx2048M -Djava.io.tmpdir=${KL_WORKING_DIR} -DKL_LOG_TYPE=console" <-- (3) + docker run --name beedeem_machine -i -t --rm \ - -v /path/to/bank/installation:/beedeem-db \ <-- (1) - -v /path/to/work/dir:/beedeem-wk \ <-- (2) - beedeem_machine <-- (3) + -e "KL_JRE_ARGS=$KL_JRE_ARGS" \ + -e "KL_WORKING_DIR=$KL_WORKING_DIR" \ + -e "KL_mirror__path=$KL_mirror__path" \ + -v /path/to/bank/installation:/beedeem-db \ + -v /path/to/work/dir:/beedeem-wk \ + beedeem_machine <-- (4) (1) where to install banks. Update '/path/to/...' to target your local system. - DO NOT MODIFY '/beedeem-db'. (2) where to put BeeDeeM logs. Update '/path/to/...' to target your local system. - DO NOT MODIFY '/beedeem-wk'. - (3) what to do. See 'Sample use cases', below. + (3) Arguments to run Java Runtime Environment (BeeDeeM is a Java software) + (4) what to do. See 'Sample use cases', below. +You can review the 'test_container.sh' script to look at a working example. ### Sample use cases 1/ install a simple bank: - docker run .../... beedeem_machine install.sh -desc PDB_proteins + docker run .../... beedeem_machine bdm install -desc PDB_proteins -Will invoke 'install.sh' BeeDeeM script. See [BeeDeeM user manual](https://pgdurand.gitbooks.io/beedeem/test_install.html\#install-a-bank) for more details. +Will invoke 'bdm' BeeDeeM script with command 'install'. See [BeeDeeM user manual](https://pgdurand.gitbooks.io/beedeem/test_install.html\#install-a-bank) for more details. 2/ install an annotated bank: - docker run .../... beedeem_machine install.sh -desc SwissProt_human + docker run .../... 
beedeem_machine bdm install -desc SwissProt_human 3/ get list of installed banks: - docker run .../... beedeem_machine info.sh -d all -f txt + docker run .../... beedeem_machine bdm info -d all -f txt 4/ query a bank to fetch an entry: - docker run .../... beedeem_machine query.sh -d protein -i P31946 -f txt + docker run .../... beedeem_machine bdm query -d protein -i P31946 -f txt If it fails, just try this form: - docker run .../... beedeem_machine query.sh protein P31946 txt + docker run .../... beedeem_machine bdm query protein P31946 txt ### Monitor BeeDeeM @@ -63,42 +80,22 @@ In all cases, consult BeeDeeM working directory to check out log files in case c This working directory is specified by this 'docker run' argument: - -v /path/to/work/dir:/beedeem-wk + -v /path/to/work/dir:/path/to/work/dir Which means that BeeDeeM log files can be located on your system within '/path/to/work/dir'. If needeed, you can tell BeeDeeM to dump logs directly on the console using this command: - docker run .../... -e "KL_LOG_TYPE=console" beedeem_machine install.sh -desc PDB_proteins + docker run .../... -e "KL_LOG_TYPE=console" beedeem_machine bdm install -desc PDB_proteins ### Default JRE memory usage Java is pre-configured to use up to 2 Gb RAM. You can change this by adding such an argument to your docker run command: - docker run .../... -e "KL_JRE_ARGS=-Xms128M -Xmx1G -Djava.io.tmpdir=/beedeem-wk" beedeem_machine install.sh -desc PDB_proteins + docker run .../... -e "KL_JRE_ARGS=-Xms128M -Xmx1G -Djava.io.tmpdir=/path/to/work/dir" beedeem_machine bdm install -desc PDB_proteins Tips: ALWAYS redirect appropriately JRE tmp directory to somewhere outside the container! This is the reason why you see a -Djava.io.tmpdir directive in the previous command. -### Here is a working command on my OSX computer: - -1. 
I created these directories: - - /Users/pgdurand/biobanks (1) - /Users/pgdurand/biobanks/log (2) - - (1) will host my banks on my computer - (2) will host BeeDeeM log files on my computer - -2. Then I can install a bank as follows: - - docker run --name beedeem_machine -i -t --rm \ - -v /Users/pgdurand/biobanks:/beedeem-db \ - -v /Users/pgdurand/biobanks/log:/beedeem-wk \ - beedeem_machine \ - install.sh -desc PDB_proteins - -In that case, BeeDeeM installs bank within directory '/beedeem-db', which actually targets '/Users/pgdurand/biobanks' through the Docker container. In a similar way, BeeDeeM creates a log file within '/beedeem-wk', which is actually '/Users/pgdurand/biobanks/log'. - ## Additional notes ### Root access inside the container @@ -108,3 +105,11 @@ You'll be able to enter into the container, as follows: - if running: docker exec -it beedeem_machine bash - if not yet running: docker run --rm -i -t beedeem_machine bash + +### Convert Docker image to Singularity + +``` +docker save beedeem-<version> -o beedeem.tar +singularity build beedeem-<version>.sif docker-archive://beedeem.tar +``` + diff --git a/docker/test_container.sh b/docker/test_container.sh new file mode 100755 index 0000000..92b11b8 --- /dev/null +++ b/docker/test_container.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash + +# Test script for BeeDeeM docker container +# How to? +# Step 1: build image with: docker build -f Dockerfile -t beedeem-$BDM_VERSION . +# (update version to match BDM_VERSION variable, below) +# Step 2: test with either +# a. ./test_container.sh +# b. qsub test_container.sh (PBS Pro) +# c. srun test_container.sh (slurm, direct execution) +# +# P. 
Durand (SeBiMER, Ifremer), last update on March 2023 + +# Sample config for Slurm; adapt partition to your cluster configuration +#SBATCH -p fast +#SBATCH --mem 4GB +#SBATCH -t 0-02:00 #(D-HH:MM) +#SBATCH -o lftp.%N.%j.out # STDOUT +#SBATCH -e lftp.%N.%j.err # STDERR + +# Sample config for PBS pro; adapt queue to your cluster configuration +#PBS -q ftp +#PBS -l walltime=02:00:00 +#PBS -l mem=4g +#PBS -l ncpus=2 + +# Version of BeeDeeM to test +BDM_VERSION=5.0.0 +# Default working directory to test BeeDeeM Docker image. +# It will be overridden below, depending on the host platform +BDM_SCRATCH_DIR=/tmp +# What is the name of the BeeDeeM image +BDM_DKR_IMG_NAME=beedeem-${BDM_VERSION} + +# -------- +# FUNCTION: figure out whether or not a command exists. +# arg1: a command name. +# return: 0 if success +hasCommand () { + command -v "$1" >/dev/null 2>&1 +} + +# Depending on host platform, load singularity env +if hasCommand qstat; then + hname=$(hostname) + if [[ $hname == *"data"* ]]; then + echo "running on DATARMOR using PBS Pro scheduler" + source /etc/profile.d/modules.sh + module purge + module load singularity/3.4.1 + BDM_SCRATCH_DIR=$SCRATCH + fi +elif hasCommand sbatch; then + hname=$(hostname -A) + if [[ $hname == *"roscoff"* ]]; then + echo "running on ABiMS using SLURM scheduler" + BDM_PLATFORM="abims" + BDM_SCRATCH_DIR=$HOME + fi +else + echo "Cannot figure out which job scheduler is available." 
+ echo " Execute BeeDeeM directly on THIS computer" +fi + +# Configure BeeDeeM banks and working directories +BDM_SCRATCH_DIR="$BDM_SCRATCH_DIR/test_beedeem" +BDM_BANKS_DIR="$BDM_SCRATCH_DIR/banks" +BDM_WORK_DIR="$BDM_SCRATCH_DIR/working" + +# Configure Docker bind mounts (host paths are mapped to identical paths in the container) +BDM_BINDS="-v ${BDM_WORK_DIR}:${BDM_WORK_DIR} -v ${BDM_BANKS_DIR}:${BDM_BANKS_DIR}" + +# For debugging if needed: dump all BDM_XXX variables +( set -o posix ; set ) | grep "BDM_" + +# Ensure working paths exist +mkdir -p $BDM_BANKS_DIR +mkdir -p $BDM_WORK_DIR + +# Prepare env variables to be used by BeeDeeM inside the container (mandatory) +KL_JRE_ARGS="-Xms128M -Xmx2048M -Djava.io.tmpdir=${BDM_WORK_DIR} -DKL_LOG_TYPE=console" +KL_WORKING_DIR=${BDM_WORK_DIR} +KL_mirror__path=${BDM_BANKS_DIR} + +# Set the banks to install +# These are '.dsc' files located in BeeDeeM image at path /opt/beedeem/conf/descriptors +DESCRIPTOR="SwissProt_human,PDB_proteins" + +# Now, let's start a simple installation +echo "1/2 - Start BeeDeeM test: run a bank installation" +CMD_BASE="docker run --name $BDM_DKR_IMG_NAME -i -t --rm -e \"KL_JRE_ARGS=$KL_JRE_ARGS\" -e \"KL_WORKING_DIR=$KL_WORKING_DIR\" -e \"KL_mirror__path=$KL_mirror__path\" $BDM_BINDS $BDM_DKR_IMG_NAME" +#CMD="$CMD_BASE bdm -h" +CMD="$CMD_BASE bdm install -desc $DESCRIPTOR" +echo $CMD +eval $CMD +if [ $? -eq 0 ]; then + echo "SUCCESS" +else + echo "FAILED. Review logs, above" + exit 1 +fi + +echo + +echo "2/2 - Start BeeDeeM-Tools test suite" +mkdir -p $BDM_WORK_DIR/bdm-tools +CMD="$CMD_BASE /opt/beedeem-tools/test.sh -w $BDM_WORK_DIR/bdm-tools" +echo $CMD +eval $CMD +if [ $? -eq 0 ]; then + echo "SUCCESS" +else + echo "FAILED. 
Review logs, above" + exit 1 +fi + + + diff --git a/scripts/bdm.sh b/scripts/bdm.sh index 654495a..b350558 100755 --- a/scripts/bdm.sh +++ b/scripts/bdm.sh @@ -32,7 +32,7 @@ # *** Bank installation scripts of BeeDeeM (conf/scripts) requires realpath: # available in BASH 5 for macOS # or available from Linux:coreutils -which realpath +RES=$(which realpath) if [ ! $? -eq 0 ]; then echo "/!\ ERROR: realpath command not found" echo " macOS: install bash 5" diff --git a/scripts/test_bdm.sh b/scripts/test_bdm.sh index a990cb7..b0dcd5b 100755 --- a/scripts/test_bdm.sh +++ b/scripts/test_bdm.sh @@ -66,7 +66,7 @@ SW_ENTRY="ZZZ3_HUMAN" echo "*** TEST 3: query bank for entry: $SW_ENTRY" # start BeeDeeM with 'query' command -bdm query -d protein -f txt -i $SW_ENTRY +bdm query -d p -f txt -i $SW_ENTRY if [ $? -eq 0 ]; then echo "TEST 3: SUCCESS" diff --git a/singularity/README.md b/singularity/README.md index 2727813..4959b7e 100644 --- a/singularity/README.md +++ b/singularity/README.md @@ -1,5 +1,12 @@ # Recipe to build a Singularity image for BeeDeeM software +It is worth noting that this BeeDeeM image actually contains both BeeDeeM and [BeeDeeM-Tools](https://gitlab.ifremer.fr/bioinfo/BeeDeeM-Tools). + +Get a pre-built image +--------------------- + +Simply go to [https://data-dataref.ifremer.fr/bioinfo/ifremer/sebimer/tools/ORSON/](https://data-dataref.ifremer.fr/bioinfo/ifremer/sebimer/tools/ORSON/) to get the latest combined image of BeeDeeM and [BeeDeeM-Tools](https://gitlab.ifremer.fr/bioinfo/BeeDeeM-Tools); look at 'beedeem-<version>.sif' file. + How to build? 
------------- diff --git a/singularity/Singularity b/singularity/Singularity index 1fd7314..a463df9 100644 --- a/singularity/Singularity +++ b/singularity/Singularity @@ -1,12 +1,15 @@ # ############################################################################# # Recipe to build a Singularity image for BeeDeeM software +# +# Build image from official zipball available at: +# https://github.com/pgdurand/BeeDeeM/releases/download/v${BDM_VERSION}/beedeem-${BDM_VERSION}-distrib.zip # ############################################################################# Bootstrap : docker From: ubuntu:18.04 %post - BDM_VERSION=4.7.4 + BDM_VERSION=5.0.0 #apt update # software-properties-common required to install Java Runtime # libbz2 libidn are required by blast tools included in BeeDeeM. @@ -15,8 +18,9 @@ From: ubuntu:18.04 # bash is required when running this image by Nextflow pipelines. # wget is required to install some banks (e.g. GeneOntology). # unzip is required to deploy BeeDeeM archive + # openssh-server is required to submit jobs on cluster through ssh connection apt-get -y update - apt-get -y install software-properties-common bzip2 libidn11 bash wget libxext6 libxrender1 libxtst6 libxi6 unzip + apt-get -y install software-properties-common bzip2 libidn11 bash wget libxext6 libxrender1 libxtst6 libxi6 unzip openssh-server # Java add-apt-repository ppa:ts.sch.gr/ppa \ @@ -50,6 +54,7 @@ From: ubuntu:18.04 chmod +x bdm chmod +x /opt/beedeem/external/bin/linux/* chmod +x /opt/beedeem/conf/scripts/*.sh + chmod +x /opt/beedeem/conf/scripts/scheduler/*.sh cd /opt/beedeem/external/bin && rm -rf macos windows BDMT_VERSION=2.1.0 diff --git a/singularity/Singularity.local b/singularity/Singularity.local new file mode 100644 index 0000000..b441ada --- /dev/null +++ b/singularity/Singularity.local @@ -0,0 +1,76 @@ +# ############################################################################# +# Recipe to build a Singularity image for BeeDeeM software +# +# Build image from 
freshly built BeeDeeM zipball available in +# ../distrib : for DEV/DEBUG purpose only to prepare/test +# official release. +# ############################################################################# + +Bootstrap : docker +From: ubuntu:18.04 + +%files + ../distrib/beedeem-*-distrib.zip /opt + +%post + BDM_VERSION=5.0.0 + #apt update + # software-properties-common required to install Java Runtime + # libbz2 libidn are required by blast tools included in BeeDeeM. + # libxext libxrender libxtst libxi are required by Bioinformatics-Core-API included + # in BeeDeeM and BeeDeeM-Tools. + # bash is required when running this image by Nextflow pipelines. + # wget is required to install some banks (e.g. GeneOntology). + # unzip is required to deploy BeeDeeM archive + # openssh-server is required to submit jobs on cluster through ssh connection + apt-get -y update + apt-get -y install software-properties-common bzip2 libidn11 bash wget libxext6 libxrender1 libxtst6 libxi6 unzip openssh-server + + # Java + add-apt-repository ppa:ts.sch.gr/ppa \ + && echo debconf shared/accepted-oracle-license-v1-1 select true | debconf-set-selections \ + && echo debconf shared/accepted-oracle-license-v1-1 seen true | debconf-set-selections \ + && apt-get update \ + && apt-get -y install oracle-java8-installer \ + && apt-get -y install oracle-java8-set-default \ + && apt-get clean all \ + && rm -rf /var/lib/apt/lists/* + cd /usr/lib/jvm/java-8-oracle + rm -rf bin include lib man *.zip + + # Get BeeDeeM from latest release + mkdir -p /opt/beedeem/tmp-install + mv /opt/beedeem-${BDM_VERSION}-distrib.zip /opt/beedeem/tmp-install + cd /opt/beedeem/tmp-install + + # Unpack BeeDeeM to /opt/beedeem + unzip beedeem-${BDM_VERSION}-distrib.zip + tar -zxf beedeem-${BDM_VERSION}.tar.gz -C /opt/beedeem + cd .. 
+ rm -rf tmp-install + + # Do a little configuration + cp scripts/bdm.sh ./bdm + sed -i 's/@KL_WORKING_DIR@/\/beedeem-wk/g' bdm + sed -i 's/@JAVA_ARGS@/-Xms128M -Xmx2048M -Djava.io.tmpdir=\$KL_WORKING_DIR -DKL_LOG_TYPE=console/g' bdm + cp scripts/dbms.config conf + sed -i 's/@BIOBASE_ROOTDIR@/\/beedeem-db/g' conf/dbms.config + chmod +x bdm + chmod +x /opt/beedeem/external/bin/linux/* + chmod +x /opt/beedeem/conf/scripts/*.sh + chmod +x /opt/beedeem/conf/scripts/scheduler/*.sh + cd /opt/beedeem/external/bin && rm -rf macos windows + + BDMT_VERSION=2.1.0 + cd /opt + mkdir beedeem-tools + cd beedeem-tools + wget https://github.com/ifremer-bioinformatics/BeeDeeM-Tools/releases/download/v${BDMT_VERSION}/beedeem-tools-${BDMT_VERSION}.tar.gz + gunzip beedeem-tools-${BDMT_VERSION}.tar.gz + tar -xf beedeem-tools-${BDMT_VERSION}.tar + rm beedeem-tools-${BDMT_VERSION}.tar + chmod +x *.sh + + +%environment + export PATH="/opt/beedeem:/opt/beedeem-tools:$PATH" diff --git a/singularity/test_container.sh b/singularity/test_container.sh index e887069..e7b9093 100755 --- a/singularity/test_container.sh +++ b/singularity/test_container.sh @@ -5,7 +5,7 @@ # # How to? # -# Step 1: build image with: singularity build -f beedeem-4.7.6.sif Singularity +# Step 1: build image with: singularity build -f beedeem-5.0.0.sif Singularity # (update version to match BDM_VERSION variable, below) # # Step 2: test with either @@ -35,7 +35,7 @@ # Section 1: prepare BeeDeeM test suite # Version of BeeDeeM to test -BDM_VERSION=4.7.6 +BDM_VERSION=5.0.0 # Default working directory to test BeeDeeM Singularity image. 
# Il will be overriden below, given host platform BDM_SCRATCH_DIR=/tmp @@ -162,8 +162,7 @@ echo "########################################################################## echo "# Start BeeDeeM test bank installation" # These are '.dsc' files located in BeeDeeM image at path /opt/beedeem/conf/descriptors DESCRIPTOR="SwissProt_human,PDB_proteins" -#DESCRIPTOR="SwissProt_human" -CMD="singularity run ${BDM_BINDS} ${BDM_SINGULITY_IMG} install.sh -desc ${DESCRIPTOR}" +CMD="singularity run ${BDM_BINDS} ${BDM_SINGULITY_IMG} bdm install -desc ${DESCRIPTOR}" echo $CMD eval $CMD if [ $? -eq 0 ]; then @@ -191,7 +190,7 @@ fi # see https://github.com/pgdurand/BlastViewer echo "###############################################################################" echo "# Annotate results" -CMD="annotate.sh -i $KL_WORKING_DIR/query_vs_SW.xml -o $KL_WORKING_DIR/query_vs_SW.zml -type full -writer zml" +CMD="bdm annotate -i $KL_WORKING_DIR/query_vs_SW.xml -o $KL_WORKING_DIR/query_vs_SW.zml -type full -writer zml" CMD="singularity run ${BDM_BINDS} ${BDM_SINGULITY_IMG} $CMD" echo $CMD eval $CMD diff --git a/src/bzh/plealog/dbmirror/fetcher/DBServerConfig.java b/src/bzh/plealog/dbmirror/fetcher/DBServerConfig.java index 193a8a4..c99d9eb 100755 --- a/src/bzh/plealog/dbmirror/fetcher/DBServerConfig.java +++ b/src/bzh/plealog/dbmirror/fetcher/DBServerConfig.java @@ -521,7 +521,7 @@ public void setLocalFolder(String folderPath) { public String getLocalTmpFolder() { return getLocalFolder() + DBMSAbstractConfig.DOWNLOADING_DIR + File.separator - + DBMSExecNativeCommand.formatNativePath(getName(), false, false); + + DBMSExecNativeCommand.formatNativePath(getName(), false, false);/*GT*/ } /** @@ -531,7 +531,7 @@ public String getLocalTmpFolder() { public String getLocalProdFolder() { return Paths.get(DBMSAbstractConfig.getLocalMirrorPath(), this.getTypeCode(), this.getName(), DBMSAbstractConfig.CURRENT_DIR, - DBMSExecNativeCommand.formatNativePath(getName(), false, false)) + 
DBMSExecNativeCommand.formatNativePath(getName(), false, false))/*GT*/ .toString(); } diff --git a/src/bzh/plealog/dbmirror/fetcher/PFTPLoaderSystem.java b/src/bzh/plealog/dbmirror/fetcher/PFTPLoaderSystem.java index 5f57444..559495b 100755 --- a/src/bzh/plealog/dbmirror/fetcher/PFTPLoaderSystem.java +++ b/src/bzh/plealog/dbmirror/fetcher/PFTPLoaderSystem.java @@ -224,7 +224,9 @@ public synchronized void runProcessing() { } catch (InterruptedException e1) { //not bad } - + //In case pre-processing task(s) failed, stop installing this bank + if (LoggerCentral.errorMsgEmitted() || LoggerCentral.processAborted()) + break; // FTP or Local installation? if (dbConf.getAddress() != null && !dbConf.getAddress().equals("")) { LoggerCentral.info(LOGGER, "FTP descriptor file: " + fName); @@ -299,7 +301,7 @@ public synchronized void runProcessing() { //date of bank currently installed String dbPathCur = Paths.get(DBMSAbstractConfig.getLocalMirrorPath(), - dbConf.getTypeCode(), dbConf.getName(), + dbConf.getTypeCode(), dbConf.getName(), /*GT*/ DBMSAbstractConfig.CURRENT_DIR, dbConf.getName()).toString(); // caution: first time bank installation, dbPathCur does not exist if (new File(dbPathCur).exists()){ @@ -527,7 +529,7 @@ public void run() { if (LoggerCentral.errorMsgEmitted()) { LoggerCentral.error( LogFactory.getLog(DBMSAbstractConfig.KDMS_ROOTLOG_CATEGORY - + ".PMirror"), "PROCESSING: FAILED. Check ERROR messages in logs."); + + ".PMirror"), "PROCESSING: FAILED. Check ERROR, WARN messages in logs."); } else if (LoggerCentral.processAborted()) { LoggerCentral.error( LogFactory.getLog(DBMSAbstractConfig.KDMS_ROOTLOG_CATEGORY diff --git a/src/bzh/plealog/dbmirror/main/CmdLineQuery.java b/src/bzh/plealog/dbmirror/main/CmdLineQuery.java index 4034639..fb57e8a 100755 --- a/src/bzh/plealog/dbmirror/main/CmdLineQuery.java +++ b/src/bzh/plealog/dbmirror/main/CmdLineQuery.java @@ -34,7 +34,7 @@ /** * This is the class to use to query the databanks managed with BeeDeeM. 
* Command line is as follows:
- * -d databank, one of: protein, nucleotide or dico
+ * -d databank, one of: p, n or d
* -i comma-separated list of sequence IDs, of path to a file of seqIDs (one per line)
* -f format. One of: txt, fas, html, insd, finsd.
* -o output. If not set, default to stdout.
@@ -102,6 +102,25 @@ private Options getCmdLineOptions() { return opts; } + /** + * Get repository for which we have to execute query. Default is nucleotide. + */ + private String getDatabaseType(CommandLine cmdLine) { + String val = cmdLine.getOptionValue(DATABASE); + if ("n".equals(val)) { + return "nucleotide"; + } + else if ("p".equals(val)) { + return "protein"; + } + else if ("d".equals(val)) { + return "dico"; + } + else if (val.contains("d:")) { + return "dico:"+val.substring(val.indexOf(':')+1); + } + else return "nucleotide"; + } @Override public boolean execute(String[] args) { PQueryMirrorBase qm; @@ -123,7 +142,7 @@ public boolean execute(String[] args) { // get the key/value data and process the query values = new Hashtable<>(); - values.put("database", cmdLine.getOptionValue(DATABASE)); + values.put("database", getDatabaseType(cmdLine)); values.put("id", cmdLine.getOptionValue(SEQID)); values.put("format", cmdLine.getOptionValue(FORMAT)); diff --git a/src/bzh/plealog/dbmirror/main/CmdLineUtils.java b/src/bzh/plealog/dbmirror/main/CmdLineUtils.java index 6a09988..6b6dd29 100644 --- a/src/bzh/plealog/dbmirror/main/CmdLineUtils.java +++ b/src/bzh/plealog/dbmirror/main/CmdLineUtils.java @@ -69,7 +69,7 @@ private static String getFooter() { buf.append(" -DKL_CONF_DIR=/path/to/new/conf_dir . Such a path must target all expected conf sub-directories (system, scripts, descriptors)\n"); buf.append("--\n"); buf.append("To override dbms.config values, use JRE args:\n"); - buf.append(" -DKL_=, where is a dbms.config key and is a value. For key, replace '.' by '__' (double underscore). E.g to override default 'mirror.path' value, use JRE argument -DKL_mirror__path=/new/path\n"); + buf.append(" -DKL_=, where is a dbms.config key and is a value. For key, replace '.' by '__' (double underscore). 
E.g to override default 'mirror.path' value, use JRE argument -DKL_mirror.path=/new/path\n"); buf.append("--\n"); buf.append(props.getProperty("prg.app.name")); buf.append(" "); diff --git a/src/bzh/plealog/dbmirror/main/DeleteBank.java b/src/bzh/plealog/dbmirror/main/DeleteBank.java index 9756c23..564c56d 100644 --- a/src/bzh/plealog/dbmirror/main/DeleteBank.java +++ b/src/bzh/plealog/dbmirror/main/DeleteBank.java @@ -1,4 +1,20 @@ -package bzh.plealog.dbmirror.main; +/* Copyright (C) 2007-2023 Patrick G. Durand + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * You may obtain a copy of the License at + * + * https://www.gnu.org/licenses/agpl-3.0.txt + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + */ + package bzh.plealog.dbmirror.main; import java.text.MessageFormat; import java.util.List; @@ -20,9 +36,9 @@ /** * A utility tool to delete bank from he command-line. Command-line arguments are:
* - * -code : index code of the bank to delete. Such a code can be obtained - * using the 'info' tool (use 'code' format).
- * -info: display bank directory to be deleted WITHOUT deleting it!
+ * -n : name of the bank to delete. Such a name can be obtained + * using the 'info' tool. Bank name is case sensitive.
+ * -p: print bank directory to be deleted WITHOUT deleting it!
* * In addition, some parameters can be passed to the JVM for special * configuration purposes:
@@ -43,8 +59,8 @@ @BdmTool(command="delete", description="delete bank(s)") public class DeleteBank implements BdmToolApi{ - private static final String CODE_ARG = "code"; - private static final String INFO_ARG = "info"; + private static final String NAME_ARG = "n"; + private static final String INFO_ARG = "p"; /** * Setup the valid command-line of the application. @@ -58,7 +74,7 @@ private Options getCmdLineOptions() { .hasArg() .isRequired() .withDescription(DBMSMessages.getString("Tool.DeleteBank.arg1.desc")) - .create( CODE_ARG ); + .create( NAME_ARG ); opts = new Options(); opts.addOption(idx); @@ -85,7 +101,7 @@ public boolean execute(String[] args) { StarterUtils.configureApplication( null, DBMSMessages.getString("Tool.DeleteBank.name"), - true, false, true); + true, false, false); // Handle command-line cmdLine = CmdLineUtils.handleArguments( @@ -102,9 +118,9 @@ public boolean execute(String[] args) { descriptors = DBDescriptorUtils.prepareIndexDBList(conf); // Locate the bank to delete - dbCode = cmdLine.getOptionValue(CODE_ARG); + dbCode = cmdLine.getOptionValue(NAME_ARG); for (IdxDescriptor idx : descriptors){ - if (idx.getKbCode().equals(dbCode)){ + if (idx.getName().equals(dbCode)){ desc = idx; break; } diff --git a/src/bzh/plealog/dbmirror/main/DumpBankList.java b/src/bzh/plealog/dbmirror/main/DumpBankList.java index 32f6430..5000f86 100644 --- a/src/bzh/plealog/dbmirror/main/DumpBankList.java +++ b/src/bzh/plealog/dbmirror/main/DumpBankList.java @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2022 Patrick G. Durand +/* Copyright (C) 2007-2023 Patrick G. Durand * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -54,7 +54,7 @@ /** * This is the class to use to report the list of installed banks. Accepted * arguments are:
- * -d type of repository. One of: n, p, b, all. Default is: all.
+ * -d type of repository. One of: n, p, d, all. Default is: all.
* -f format. One of: txt, html, galaxy. Default is: txt.
* -u user-name.
* In addition, some parameters can be passed to the JVM for special @@ -262,7 +262,7 @@ protected boolean doJob(OutputStream os, String db, String ft, String us){ dbs2.put("mirror_p", dbList); dbs2.put("mirror_n", emptyList); dbs2.put("mirror_d", emptyList); - } else if ("b".equals(db)) { + } else if ("d".equals(db)) { // only biological classification dbList = getMirrorDBList(DBDescriptor.TYPE.dico, us); dbTotalSize = countSize(dbTotalSize, dbList); diff --git a/src/bzh/plealog/dbmirror/task/PPreTaskProcessor.java b/src/bzh/plealog/dbmirror/task/PPreTaskProcessor.java index ba241e1..6dbf9a8 100644 --- a/src/bzh/plealog/dbmirror/task/PPreTaskProcessor.java +++ b/src/bzh/plealog/dbmirror/task/PPreTaskProcessor.java @@ -102,6 +102,7 @@ private void stackTasks() { @Override public void run() { + boolean initThread=true; while (true) { //force exit from this loop if no more tasks under processing //or user request to cancel bank installation (UI only) @@ -110,9 +111,12 @@ public void run() { break; } try { - LoggerCentral.info( + if (initThread) {//display this message one times + initThread = false; + LoggerCentral.info( LogFactory.getLog(DBMSAbstractConfig.KDMS_ROOTLOG_CATEGORY + ".PPreTaskProcessor"), WAIT_MSG); + } sleep(_waitTime); } catch (InterruptedException e) { } diff --git a/src/bzh/plealog/dbmirror/task/PTaskExecScript.java b/src/bzh/plealog/dbmirror/task/PTaskExecScript.java index 2d3227e..62135ae 100644 --- a/src/bzh/plealog/dbmirror/task/PTaskExecScript.java +++ b/src/bzh/plealog/dbmirror/task/PTaskExecScript.java @@ -47,6 +47,7 @@ public class PTaskExecScript extends PAbstractTask { private String _errMsg; private String _bankName; private String _bankType; + private String _calledScriptArguments; protected Map _args; //mandatory arguments user has to provide in 'script()' task @@ -58,6 +59,8 @@ public class PTaskExecScript extends PAbstractTask { public static final String INST_FILE_ARG = "-f"; // path to file (unit task only) public static final 
String BANK_NAME_ARG = "-n"; // bank name public static final String BANK_TYPE_ARG = "-t"; // bank type (p,n,d) + public static final String SCRIPT_ARGS_ARG = "-o"; // additional arguments to transmit to script + public static final String NO_ARG = "NA"; public static final String UNIX_FILE_EXT = ".sh"; public static final String WIN_FILE_EXT = ".bat"; @@ -79,7 +82,7 @@ public PTaskExecScript(String dbPath, String currentFile, String bankName, Strin _curFile = currentFile; _bankName = bankName; _bankType = bankType; - + _calledScriptArguments = null; } /** @@ -93,7 +96,7 @@ public void setParameters(String params) { _args = Utils.getTaskArguments(params); _scriptName = _args.get(SCRIPT_NAME); //to ensure using software on all OS, script is passed in without - //file extension. Its added here according to OS. + //file extension. It is added here according to OS. _scriptCmd = _args.get(SCRIPT_CMD_PATH); if (DBMSExecNativeCommand.getOSType()==DBMSExecNativeCommand.WINDOWS_OS) { _scriptCmd += WIN_FILE_EXT; @@ -101,15 +104,28 @@ public void setParameters(String params) { else { _scriptCmd += UNIX_FILE_EXT; } + _args.remove(SCRIPT_NAME); + _args.remove(SCRIPT_CMD_PATH); + //Additional arguments are supposed to be ones to transmit to script. + //For that purpose, we encode a special string. 
It'll be decoded by + //conf/scripts/scheduler/common.sh script + if (!_args.isEmpty()) { + StringBuffer buf = new StringBuffer("'"); + String val; + for (String key : _args.keySet()) { + buf.append(key); + buf.append(";"); + val = _args.get(key); + if (!NO_ARG.equals(val)) { + buf.append(val); + buf.append(";"); + } + } + buf.append("'"); + _calledScriptArguments=buf.toString(); + } } - if (_dbInstallationPath!=null) { - _args.put(INST_DIR_ARG, _dbInstallationPath); - - } - if (_curFile!=null) { - _args.put(INST_FILE_ARG, _curFile); - - } + } /** @@ -194,6 +210,9 @@ public boolean execute() { } params.put(BANK_NAME_ARG, new CommandArgument(_bankName, false)); params.put(BANK_TYPE_ARG, new CommandArgument(_bankType, false)); + if(_calledScriptArguments != null) { + params.put(SCRIPT_ARGS_ARG, new CommandArgument(_calledScriptArguments, false)); + } Process proc = executor.executeAndReturn(_scriptCmd, params); @@ -220,6 +239,9 @@ public boolean execute() { if (exitCode==0) { PAbstractTask.setTaskOkForFile(resumeFile); } + else { + _errMsg = String.format("unexpected shell exit code: %d", exitCode); + } return exitCode==0; } diff --git a/src/bzh/plealog/dbmirror/task/PTaskHandleHistory.java b/src/bzh/plealog/dbmirror/task/PTaskHandleHistory.java index 3a93434..61c8f18 100755 --- a/src/bzh/plealog/dbmirror/task/PTaskHandleHistory.java +++ b/src/bzh/plealog/dbmirror/task/PTaskHandleHistory.java @@ -82,7 +82,7 @@ private boolean handleHistory() { if (f.isFile()) continue; dirName = f.getName(); - if (dirName.startsWith("currentOn") == false) + if (dirName.startsWith(DBMSAbstractConfig.CURRENTON_DIR) == false) continue; dirNames.add(dirName); } diff --git a/src/bzh/plealog/dbmirror/task/PTaskInstallInProduction.java b/src/bzh/plealog/dbmirror/task/PTaskInstallInProduction.java index 79d6321..04c3ddc 100755 --- a/src/bzh/plealog/dbmirror/task/PTaskInstallInProduction.java +++ b/src/bzh/plealog/dbmirror/task/PTaskInstallInProduction.java @@ -445,7 +445,7 @@ private boolean 
doJob() { // check if we have some sequences in the db dbSizes = getTotalEntries(Utils.terminatePath(dbPathDStamp) - + db.getName()); + + db.getName());/*GT*/ if ((dbSizes[0] == 0) && (dbSizes[1] == 0)) { throw new Exception( "unable to install Blast databank in production: no sequences"); @@ -465,8 +465,13 @@ private boolean doJob() { fCur = new File(dbPathCur); // rename current Production dir to something else if (fCur.exists()) { + String curRelDate = DBStampProperties.getDBTimeStampAsDirStr( + Utils.terminatePath(dbPathCur) + db.getName()); + if (curRelDate==null) { + curRelDate = DBMSAbstractConfig.getStarterDate(); + } str = dbPath + DBMSAbstractConfig.CURRENTON_DIR - + DBMSAbstractConfig.getStarterDate(); + + curRelDate; // when reloading a mirror, str may already exists : destroy it if (new File(str).exists()) { if (!PAntTasks.deleteDirectory(str)) { diff --git a/src/bzh/plealog/dbmirror/ui/resources/messages.properties b/src/bzh/plealog/dbmirror/ui/resources/messages.properties index e7b11ee..be9e8ab 100755 --- a/src/bzh/plealog/dbmirror/ui/resources/messages.properties +++ b/src/bzh/plealog/dbmirror/ui/resources/messages.properties @@ -318,11 +318,11 @@ Tool.Master.err3.cmd=Error while scanning for BeeDeeM main classes\: Tool.Master.err4.cmd=Class not found\: Tool.DeleteBank.name=DeleteBank -Tool.DeleteBank.arg1.lbl=bank-code -Tool.DeleteBank.arg1.desc=index code of the bank to delete. Such a code can be obtained using the 'info' tool (use 'code' format). +Tool.DeleteBank.arg1.lbl=bank-name +Tool.DeleteBank.arg1.desc=name of the bank to delete. Such a name can be obtained using the 'info' tool. Bank name is case sensitive. Tool.DeleteBank.arg2.desc=display bank directory to be deleted WITHOUT deleting it! 
Tool.DeleteBank.info.msg1=The content of {0} will be deleted\nas well as all databanks therein:\n{1} -Tool.DeleteBank.err.msg1=Bank with code: {0}: not found +Tool.DeleteBank.err.msg1=Bank with name: {0}: not found Tool.DeleteBank.info.msg2=Bank: {0}: successfully deleted Tool.Annotate.name=Annotate @@ -339,7 +339,7 @@ Tool.Annotate.arg5.desc=include Biological Classification data. Use true or fals Tool.Query.name=Query Tool.Query.arg1.lbl=repository -Tool.Query.arg1.desc=type of repository to query. Mandatory. One of: nucleotide, protein, dico. When using dico, use one of: dico:taxon, dico:EC, dico:GO, dico:CDD or dico:InterPro. When using dico:taxon, entry ID can be either a TaxID or a Taxonomy Name (e.g. organism, phylum, etc.). In latter case, Query Tool will dump Taxonomy path. +Tool.Query.arg1.desc=type of repository to query. Mandatory. One of: n, p, d. When using d, use one of: d:taxon, d:EC, d:GO, d:CDD or d:InterPro. When using d:taxon, entry ID can be either a TaxID or a Taxonomy Name (e.g. organism, phylum, etc.). In latter case, Query Tool will dump Taxonomy path. Tool.Query.arg2.lbl=entryID Tool.Query.arg2.desc=either a single entry ID, a comma separated list of entry IDs of a path to a file of entry IDs. Mandatory. When using a file of IDs, provide a single ID per line. Tool.Query.arg3.lbl=format @@ -349,7 +349,7 @@ Tool.Query.arg4.desc=output file to save results of query. Optional, default is Tool.Dump.name=DumpBankList Tool.Dump.arg1.lbl=repository -Tool.Dump.arg1.desc=type of repository. One of: n, p, b, all. Default is: all. +Tool.Dump.arg1.desc=type of repository. One of: n, p, d, all. Default is: all. Tool.Dump.arg2.lbl=format Tool.Dump.arg2.desc=format. One of: txt, html, galaxy. Default is: txt. 
Tool.Dump.arg3.lbl=user diff --git a/src/bzh/plealog/dbmirror/util/conf/BankJsonDescriptor.java b/src/bzh/plealog/dbmirror/util/conf/BankJsonDescriptor.java index 4131ee2..4cd20c0 100644 --- a/src/bzh/plealog/dbmirror/util/conf/BankJsonDescriptor.java +++ b/src/bzh/plealog/dbmirror/util/conf/BankJsonDescriptor.java @@ -1,4 +1,4 @@ -/* Copyright (C) 2021 Patrick G. Durand +/* Copyright (C) 2021-2022 Patrick G. Durand * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -55,6 +55,7 @@ public class BankJsonDescriptor { public static final String OTHER_INDEX_PROPS = "index.properties"; public static final String OTHER_INDEX_PROP_LBL = "label"; public static final String OTHER_INDEX_PROP_KEY = "key"; + public static final String OTHER_INDEX_PROP_VER = "version"; public static final String DEFAULT_DESCRIPTOR_FNAME = "databank.json"; diff --git a/src/bzh/plealog/dbmirror/util/conf/DBMSConfigurator.java b/src/bzh/plealog/dbmirror/util/conf/DBMSConfigurator.java index b1a0431..8a1da60 100755 --- a/src/bzh/plealog/dbmirror/util/conf/DBMSConfigurator.java +++ b/src/bzh/plealog/dbmirror/util/conf/DBMSConfigurator.java @@ -157,14 +157,14 @@ public void load(String path, boolean listenReload) throws IOException { * @see java.util.Properties#getProperty(java.lang.String) */ public String getProperty(String key) { - //this was added to enable overriding config file properties using java -D arguments or standard - //environment variables. + //added to get config from environment variables + //note: using '.' 
is not allowed for shell variables names, so use "__" instead String pkey = DBMSAbstractConfigConstants.APP_KEY_PREFIX+key; - String value = DBMSAbstractConfigConstants.pruneQuotes(System.getProperty(pkey)); + String value = DBMSAbstractConfigConstants.pruneQuotes(System.getenv(pkey.replaceAll("\\.", "__"))); if (value==null) { - //added to get config from environment variables - //note: using '.' is not allowed for shell variables names, so use "__" instead - value = DBMSAbstractConfigConstants.pruneQuotes(System.getenv(pkey.replaceAll("\\.", "__"))); + //this was added to enable overriding config file properties using java -D arguments or standard + //environment variables. + value = DBMSAbstractConfigConstants.pruneQuotes(System.getProperty(pkey)); } return value !=null ? value : _pConfig.getString(key); } diff --git a/src/bzh/plealog/dbmirror/util/descriptor/DatabankDescriptor.java b/src/bzh/plealog/dbmirror/util/descriptor/DatabankDescriptor.java index 542ed7a..9438248 100755 --- a/src/bzh/plealog/dbmirror/util/descriptor/DatabankDescriptor.java +++ b/src/bzh/plealog/dbmirror/util/descriptor/DatabankDescriptor.java @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2017 Patrick G. Durand +/* Copyright (C) 2007-2023 Patrick G. 
Durand * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -17,15 +17,25 @@ package bzh.plealog.dbmirror.util.descriptor; import java.io.File; +import java.io.FileReader; +import java.io.IOException; import java.io.Serializable; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.text.DecimalFormat; import java.text.NumberFormat; +import java.util.HashMap; +import java.util.List; import java.util.Locale; +import java.util.Map; import java.util.Properties; +import java.util.stream.Collectors; import org.apache.commons.io.FileUtils; import bzh.plealog.dbmirror.util.Utils; +import bzh.plealog.dbmirror.util.conf.BankJsonDescriptor; import bzh.plealog.dbmirror.util.descriptor.DBDescriptor.TYPE; import bzh.plealog.dbmirror.util.runner.DBStampProperties; @@ -38,7 +48,8 @@ public class DatabankDescriptor implements Serializable { private static final long serialVersionUID = 5222950557207540653L; private String _name; - private String _dbPath; + private String _dbHome;//home dir of bank + private String _dbPath;//path to main index, e.g. 
blast bank alias private String _code; private String _description; private String _type; @@ -46,6 +57,7 @@ public class DatabankDescriptor implements Serializable { private String _diskSize; private String _timeStamp; private String _releaseStamp; + private HashMap _otherIndex; private long _diskSizeL; private boolean hasAnnotation = false; @@ -53,6 +65,7 @@ public class DatabankDescriptor implements Serializable { .getInstance(Locale.ENGLISH); public DatabankDescriptor(IdxDescriptor descriptor) { + _otherIndex = new HashMap<>(); _name = descriptor.getName(); _dbPath = descriptor.getCode(); _description = descriptor.getDescription(); @@ -65,8 +78,8 @@ else if (descriptor.getType().equals(TYPE.blastp) || descriptor.getType().equals else if (descriptor.getType().equals(TYPE.dico)) _type = "D"; - String directory = new File(_dbPath).getParent(); - Properties props = DBStampProperties.readDBStamp(directory); + _dbHome = new File(_dbPath).getParent(); + Properties props = DBStampProperties.readDBStamp(_dbHome); _timeStamp = props.getProperty(DBStampProperties.TIME_STAMP); _releaseStamp = props.getProperty(DBStampProperties.RELEASE_TIME_STAMP); @@ -77,8 +90,11 @@ else if (descriptor.getType().equals(TYPE.dico)) else _nbSequence = numFormatter.format(Long.valueOf(props .getProperty(DBStampProperties.NB_SEQUENCES))); - _diskSizeL = FileUtils.sizeOfDirectory(new File(directory)); + _diskSizeL = FileUtils.sizeOfDirectory(new File(_dbHome)); _diskSize = Utils.getBytes(_diskSizeL); + //_otherIndex.put("diamond:2.0.6", _dbHome); + //_otherIndex.put("blast-v4:2.6.0", _dbHome); + scanForOtherIndex(_dbHome); } /** @@ -95,6 +111,13 @@ public String getDbPath() { return _dbPath; } + /** + * @return the _dbHome + */ + public String getDbHome() { + return _dbHome; + } + /** * @return the _code */ @@ -150,4 +173,49 @@ public String getReleaseTimeStamp() { return _releaseStamp; } + public Map getAdditionalIndex(){ + return _otherIndex; + } + + private void scanForOtherIndex(String 
directory) { + List files; + //within installation directory, look for all sub-dir terminating with .idx + try { + files = Files.list(Paths.get(directory)) + .filter(Files::isDirectory) + .filter(path -> path.toString().endsWith(BankJsonDescriptor.OTHER_INDEX_FEXT)) + .map(Path::toFile) + .collect(Collectors.toList()); + } catch (IOException e) { + //LoggerCentral.warn("Unable to list additional indexes: "+e.toString()); + return; + } + // then, process additional indexes if any (bowtie, diamond, etc) + for(File idxDirectory : files) { + //Do we have a dedicated an index.properties file? + String idxPath = idxDirectory.getAbsolutePath(); + File propFile = new File(Utils.terminatePath(idxPath) + +BankJsonDescriptor.OTHER_INDEX_PROPS); + if (propFile.exists()){ + Properties props = new Properties(); + try (FileReader fr = new FileReader(propFile)){ + props.load(fr); + String idxKey = + props.getProperty(BankJsonDescriptor.OTHER_INDEX_PROP_KEY) + + " (" + + props.getProperty(BankJsonDescriptor.OTHER_INDEX_PROP_VER + ")"); + _otherIndex.put(idxKey, idxPath); + } catch (Exception e) { + //LOGGER.warn("Unable to read property file: "+propFile+": "+e.toString()); + } + } + //otherwise, use directory name has index key + else { + String fName = idxDirectory.getName(); + int idx = fName.lastIndexOf('.'); + _otherIndex.put(fName.substring(0, idx), idxPath); + } + + } + } } diff --git a/src/bzh/plealog/dbmirror/util/runner/DBMSExecNativeCommand.java b/src/bzh/plealog/dbmirror/util/runner/DBMSExecNativeCommand.java index 3dfc08e..41bb780 100755 --- a/src/bzh/plealog/dbmirror/util/runner/DBMSExecNativeCommand.java +++ b/src/bzh/plealog/dbmirror/util/runner/DBMSExecNativeCommand.java @@ -36,6 +36,7 @@ import bzh.plealog.dbmirror.util.Utils; import bzh.plealog.dbmirror.util.conf.DBMSAbstractConfig; +import bzh.plealog.dbmirror.util.conf.DBMSAbstractConfigConstants; import bzh.plealog.dbmirror.util.log.LoggerCentral; /** @@ -59,6 +60,7 @@ public class DBMSExecNativeCommand { 
public static final String APPDIR_VAR_NAME = "${appdir}"; public static final String JTMPDIR_VAR_NAME = "${javaTempDir}"; public static final String WORKDIR_VAR_NAME = "${workdir}"; + public static final String OS_NAME_VAR_NAME = "${os}"; public static final int EXEC_INTERRUPTED = -2; public static final long DEFAULT_TIME_SLICE = 2000; //milliseconds @@ -160,8 +162,9 @@ public static String formatNativePath(String path, boolean appendOSname, boolean removeEndingSeparator) { StringBuffer szBuf; StringTokenizer tokenizer; - String token; - + String token, varName, varValue; + int idxS, idxE; + szBuf = new StringBuffer(); if (path.charAt(0) == '|' || path.charAt(0) == '/' || path.charAt(0) == '\\') @@ -183,9 +186,23 @@ public static String formatNativePath(String path, boolean appendOSname, szBuf.append(DBMSAbstractConfig.getLocalMirrorPrepaPath()); } else if (token.equalsIgnoreCase(WORKDIR_VAR_NAME)) { szBuf.append(DBMSAbstractConfig.getWorkingPath()); - } else if (token.equalsIgnoreCase("${os}")) { + } else if (token.equalsIgnoreCase(OS_NAME_VAR_NAME)) { szBuf.append(getOSName()); szBuf.append(File.separator); + } else if (token.startsWith("$")) {//other system env variable + idxS = token.indexOf('{'); + idxE = token.indexOf('}'); + if (idxS!=-1 && idxE!=-1) { + varName = token.substring(idxS+1, idxE); + } + else { + varName = token.substring(1);//skip $ + } + varValue = DBMSAbstractConfigConstants.pruneQuotes(System.getenv(varName)); + if (varValue==null) { + varValue = varName; + } + szBuf.append(Utils.terminatePath(varValue)); } else { szBuf.append(token); szBuf.append(File.separator); diff --git a/src/bzh/plealog/dbmirror/util/runner/DBStampProperties.java b/src/bzh/plealog/dbmirror/util/runner/DBStampProperties.java index c22c668..064eba1 100755 --- a/src/bzh/plealog/dbmirror/util/runner/DBStampProperties.java +++ b/src/bzh/plealog/dbmirror/util/runner/DBStampProperties.java @@ -19,6 +19,7 @@ import java.io.File; import java.io.FileInputStream; import 
java.io.FileOutputStream; +import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Properties; @@ -49,6 +50,7 @@ public class DBStampProperties { public static final String DB_SIZE = "size"; // Date formatter public static final SimpleDateFormat BANK_DATE_FORMATTER = new SimpleDateFormat("yyyy-MM-dd, HH:mm"); + private static final SimpleDateFormat BANK_DATE_FORMATTER_DIR = new SimpleDateFormat("yyyy-MM-dd_HH_mm"); private static final Log LOGGER = LogFactory .getLog(DBMSAbstractConfig.KDMS_ROOTLOG_CATEGORY @@ -157,4 +159,19 @@ public static Properties readDBStamp(String dbPath) { return props; } + /** + * Returns the bank time stamp formatted as yyyy-MM-dd_HH_mm, or null when it is missing or cannot be parsed. + */ + public static String getDBTimeStampAsDirStr(String dbPath) { + Properties props = readDBStamp(dbPath); + String d = props.getProperty(TIME_STAMP, ""); + String d2 = null; + try { + Date dt = BANK_DATE_FORMATTER.parse(d); + d2 = BANK_DATE_FORMATTER_DIR.format(dt); + } catch (ParseException e) { + // missing or malformed time stamp is not fatal: null is returned + } + return d2; + } } diff --git a/src/bzh/plealog/dbmirror/util/sequence/SeqIOUtils.java b/src/bzh/plealog/dbmirror/util/sequence/SeqIOUtils.java index 6bd5316..5c22511 100755 --- a/src/bzh/plealog/dbmirror/util/sequence/SeqIOUtils.java +++ b/src/bzh/plealog/dbmirror/util/sequence/SeqIOUtils.java @@ -112,31 +112,26 @@ public class SeqIOUtils { + ".SeqIOUtils"); // see DBXrefSplitter class for more details of the syntax - public static final String DEFAULT_CONFIG_XREF_RETRIEVE = "\"DR\" , \"GO\" , \";\" , \";\" , \"GO\" , \":\"\n" - + "\"DR\" , \"InterPro\" , \";\" , \";\" , \"InterPro\" , \"$\"\n" - + "\"DR\" , \"Pfam\" , \";\" , \";\" , \"Pfam\" , \"$\"\n" - + - /* - * Tests on SW - * : Brenda - * code is not - * always - * there! Get - * EC from - * definition - * line! 
- */ - /* - * "\"DR\" , \"BRENDA\" , \";\" , \";\" , \"EC\" , \"$\"\n" - * + - */ - "\"DE\" , \"EC\" , \"=\" , \";\" , \"EC\" , \"$\"\n" - + "\"OX\" , \"NCBI_TaxID\" , \"=\" , \"};\" , \"taxon\" , \"$\"\n" - + "\"/db_xref=\",\"taxon\", \":\", \"\"\", \"taxon\", \"$\"\n"; - + public static final String DEFAULT_CONFIG_XREF_RETRIEVE = + "\"DR\" , \"GO\" , \";\" , \";\" , \"GO\" , \":\"\n" + + "\"DR\" , \"InterPro\" , \";\" , \";\" , \"InterPro\" , \"$\"\n" + + "\"DR\" , \"Pfam\" , \";\" , \";\" , \"Pfam\" , \"$\"\n" + + "\"DE\" , \"EC\" , \"=\" , \"{;\" , \"EC\" , \"$\"\n" + + "\"OX\" , \"NCBI_TaxID\" , \"=\" , \"};\" , \"taxon\" , \"$\"\n" + + "\"FT|/db_xref=\",\"taxon\" , \":\" , \"\"\" , \"taxon\" , \"$\"\n" /*(1)*/ + + "\"/db_xref=\",\"taxon\" , \":\" , \"\"\" , \"taxon\" , \"$\"\n"; + /*(1): + * Special case for EMBL entry: no OX line, but we have: + * FT /db_xref="taxon:64391" + */ public static DBXrefTagManager XREF_MANAGER = new DBXrefTagManager( DEFAULT_CONFIG_XREF_RETRIEVE); + //SwissProt specific + private static final String SW_DE_RECNAME = "RecName:"; + //TrEMBL specific + private static final String SW_DE_SUBNAME = "SubName:"; + /** * Sets a new configuration for DbXrefTagManager. 
*/ @@ -196,6 +191,31 @@ public static String getId(Sequence seq, boolean ncbiIdType) { return getId(seq.getName(), ncbiIdType); } + public static String reformatUPDescription(String desc) { + // see https://web.expasy.org/docs/userman.html#DE_line + //check if we can retrieve only RecName|SubName Full description + int idx1 = desc.indexOf(SW_DE_RECNAME); + if (idx1==-1) { + idx1 = desc.indexOf(SW_DE_SUBNAME); + } + if (idx1!=-1) { + //TrEMBL sample: DE SubName: Full=Rubrerythrin {ECO:0000313|EMBL:HGW37995.1}; + int idx2 = desc.indexOf("{", idx1); + if (idx2==-1) { + //SW sample: DE RecName: Full=14-3-3 protein beta/alpha; + idx2 = desc.indexOf(";", idx1); + } + //at least "Full=' is always present according to Uniprot manual + idx1 = desc.indexOf("=", idx1); + if (idx2!=-1) { + desc = desc.substring(idx1+1, idx2).trim(); + } + else {//security in case there is no ending ';' or '{' + desc = desc.substring(idx1+1).trim(); + } + } + return desc; + } public static String getDescription(Sequence seq, int seqType, boolean ncbiIdType) { Annotation annot; @@ -367,12 +387,16 @@ public static int guessFileFormat(String fname) { public static void fillDescription(String line, String id, String idDesc, StringBuilder buf) { buf.append(line.substring(idDesc.length()).replace(id, "").trim()); + buf.append(" ");//for multi-line } - public static String cleanDescription(StringBuilder sbDesc) { + public static String cleanDescription(StringBuilder sbDesc, String descKey) { String desc = sbDesc.toString(); if (desc.length() != 0) { - // NCBI mutli-header may contain multiple > + if ("DE".equals(descKey)) { + desc = reformatUPDescription(desc); + } + // NCBI multi-header may contain multiple > desc = Formatters.replaceAll(desc, ">", "|"); // NCBI data may contain null char @@ -719,31 +743,6 @@ private static int[] convertToFasta(InputStream is, OutputStream os, // KDMSMessages.getString("CheckSilvaLicence")); displaySilvaLicense = false; } - // Ludovic Antin 10/02/2015 : silva 
taxonomy is not the same as ncbi - // taxonomy - // String termId = null; - // - // String[] terms = line.split(";"); - // ArrayUtils.reverse(terms); - // - // for (String termName : terms) { - // termId = dico.getTaxID(termName); - // if (termId != null) - // break; - // } - // - // if (termId != null) { - // line += " "; - // line += DBXrefInstancesManager.HIT_DEF_LINE_START; - // line += DBXrefInstancesManager.TAX_KEY; - // line += - // DBXrefInstancesManager.HIT_DEF_LINE_XREF_NAME_ID_SEPARATOR; - // line += termId.substring(1); // del first character ('n') - // line += DBXrefInstancesManager.HIT_DEF_LINE_STOP; - // } else { - // LoggerCentral.info(LOGGER, "No taxonomy id for : " + line); - // } - } // in case of CDD sequences databank else if (headerFormat == DBUtils.CDD_HEADER_FORMAT) { @@ -832,7 +831,7 @@ else if (line.startsWith(contDescKey) && readDesc) { sequenceHeader.append('>'); sequenceHeader.append(id); sequenceHeader.append(' '); - sequenceHeader.append(SeqIOUtils.cleanDescription(bufDesc)); + sequenceHeader.append(SeqIOUtils.cleanDescription(bufDesc, beginDescKey)); sequenceHeader.append(' '); sequenceHeader.append(instManager.toString()); sequenceHeader.append('\n'); @@ -860,6 +859,7 @@ else if (writeSeq) { } if (Character.isLetter(c)) { bufSeq.append(c); + letterCount++; } else if (c == '*' || c == '-') { bufSeq.append('X'); } @@ -867,7 +867,7 @@ else if (writeSeq) { if (isFastq && ((i + 1) % 80) == 0) { bufSeq.append("\n"); } - letterCount++; + if (!dumpLetters) dumpLetters = true; } diff --git a/src/bzh/plealog/dbmirror/util/xref/DBXrefTagHandler.java b/src/bzh/plealog/dbmirror/util/xref/DBXrefTagHandler.java index c302c32..b9798bc 100755 --- a/src/bzh/plealog/dbmirror/util/xref/DBXrefTagHandler.java +++ b/src/bzh/plealog/dbmirror/util/xref/DBXrefTagHandler.java @@ -18,6 +18,8 @@ import java.util.Hashtable; +import bzh.plealog.dbmirror.util.Utils; + /** * This class is used to handle several DBXrefSplitter that could be associated to * a same db 
tag. As an example, when considering the DR tag of Uniprot data files, we @@ -26,7 +28,8 @@ * @author Patrick G. Durand */ public class DBXrefTagHandler { - private String tag; + private String tag; + private String tag_b = null; private String begin; private Hashtable splitters; @@ -38,7 +41,11 @@ public class DBXrefTagHandler { */ public DBXrefTagHandler(String tag, String begin) { super(); - this.tag = tag; + String[] tags = Utils.tokenize(tag, "|"); + this.tag = tags[0]; + if (tags.length>1) { + this.tag_b = tags[1]; + } this.begin = begin; splitters = new Hashtable(); } @@ -55,19 +62,27 @@ public void addSplitter(String key, String begin, String end, String code, public String getDbXref(String dataLine) { DBXrefSplitter splitter; String str, key; - int idx, idx2, size; + int idx, idx2, idx3, size; // remove ending spaces str = dataLine.trim(); + size = str.length(); // contains 'tag' ? if (str.startsWith(tag) == false) return null; // skip tag as well as non-letter chars to locate the beginning of key idx = tag.length(); - size = str.length(); // may happen in wrongly annotated files: nothing after a tag ! 
if (idx >= size) return null; + //Special case for EMBL entry: no OX line, but we have: + // FT /db_xref="taxon:64391" + if (tag_b!=null) { + idx3 = str.indexOf(tag_b, idx); + if (idx3==-1) + return null; + idx = idx3 + tag_b.length(); + } while (!Character.isLetter(str.charAt(idx))) { idx++; if (idx == size) diff --git a/src/test/unit/CmdLineQueryTest.java b/src/test/unit/CmdLineQueryTest.java index 7bf79cd..ab9ad05 100644 --- a/src/test/unit/CmdLineQueryTest.java +++ b/src/test/unit/CmdLineQueryTest.java @@ -107,7 +107,7 @@ protected PrintStream outputFile(File name) throws FileNotFoundException { @Test public void testCmdLine() { String[] args = { - "-d", "protein", + "-d", "p", "-i", "KKCC1_RAT", "-f", "txt"}; @@ -142,7 +142,7 @@ public void testCmdLineFOIDs() { File foIDsFile = new File(UtilsTest.getTestFilePath("Tools", "foIDs.txt")); String[] args = { - "-d", "protein", + "-d", "p", "-i", foIDsFile.getAbsolutePath(), "-f", "txt"}; @@ -294,7 +294,7 @@ public void testCmdLineTaxo() { } String[] args = { - "-d", "dico", + "-d", "d:taxon", "-i", "9606,2157,10116,10090,45351,99999", "-f", "txt", "-o", result.getAbsolutePath()}; diff --git a/src/test/unit/DBXrefInstancesManagerTest.java b/src/test/unit/DBXrefInstancesManagerTest.java index 43ee19d..ea11dc4 100755 --- a/src/test/unit/DBXrefInstancesManagerTest.java +++ b/src/test/unit/DBXrefInstancesManagerTest.java @@ -17,7 +17,9 @@ package test.unit; import java.io.BufferedReader; +import java.io.File; import java.io.FileInputStream; +import java.io.IOException; import java.io.InputStreamReader; import org.junit.After; @@ -27,6 +29,9 @@ import org.junit.BeforeClass; import org.junit.Test; +import com.plealog.genericapp.api.file.EZFileUtils; + +import bzh.plealog.dbmirror.reader.DBUtils; import bzh.plealog.dbmirror.util.sequence.SeqIOUtils; import bzh.plealog.dbmirror.util.xref.DBXrefInstancesManager; import bzh.plealog.dbmirror.util.xref.DBXrefTagManager; @@ -121,4 +126,100 @@ public void testThree() { 
Assert.assertEquals("[taxon; 10090, GeneID; 83563]", DBXrefInstancesManager .getDbXrefs(result).toString()); } + @Test + public void testFastaConvertor1() { + String fileIn = UtilsTest.getTestFilePath("DBXrefManager", "p12265.dat"); + File result = null; + try { + result = File.createTempFile("dbxref", ".fas"); + result.deleteOnExit(); + } catch (IOException e) { + e.printStackTrace(); + Assert.assertTrue(false);//force test to fail + } + int [] values = SeqIOUtils.convertToFasta(fileIn, result.getAbsolutePath(), + SeqIOUtils.SWISSPROT, null, null, null, DBUtils.NO_HEADER_FORMAT); + Assert.assertEquals(values[0], 1); + Assert.assertEquals(values[1], 648); + String fastaIn = UtilsTest.getTestFilePath("DBXrefManager", "p12265.fas"); + try { + String refFasta = EZFileUtils.getFileContent(new File(fastaIn)); + String newFasta = EZFileUtils.getFileContent(result); + Assert.assertEquals(refFasta, newFasta); + } catch (IOException e) { + Assert.fail(); + } + } + @Test + public void testFastaConvertor2() { + String fileIn = UtilsTest.getTestFilePath("DBXrefManager", "A0A7C4XVR8_9EURY.dat"); + File result = null; + try { + result = File.createTempFile("dbxref", ".fas"); + result.deleteOnExit(); + } catch (IOException e) { + e.printStackTrace(); + Assert.assertTrue(false);//force test to fail + } + int [] values = SeqIOUtils.convertToFasta(fileIn, result.getAbsolutePath(), + SeqIOUtils.SWISSPROT, null, null, null, DBUtils.NO_HEADER_FORMAT); + Assert.assertEquals(values[0], 1); + Assert.assertEquals(values[1], 104); + String fastaIn = UtilsTest.getTestFilePath("DBXrefManager", "A0A7C4XVR8_9EURY.fas"); + try { + String refFasta = EZFileUtils.getFileContent(new File(fastaIn)); + String newFasta = EZFileUtils.getFileContent(result); + Assert.assertEquals(refFasta, newFasta); + } catch (IOException e) { + Assert.fail(); + } + } + @Test + public void testFastaConvertor3() { + String fileIn = UtilsTest.getTestFilePath("DBXrefManager", "multi-up.dat"); + File result = null; + try { 
+ result = File.createTempFile("dbxref", ".fas"); + result.deleteOnExit(); + } catch (IOException e) { + e.printStackTrace(); + Assert.assertTrue(false);//force test to fail + } + int [] values = SeqIOUtils.convertToFasta(fileIn, result.getAbsolutePath(), + SeqIOUtils.SWISSPROT, null, null, null, DBUtils.NO_HEADER_FORMAT); + Assert.assertEquals(values[0], 3); + Assert.assertEquals(values[1], 1403); + String fastaIn = UtilsTest.getTestFilePath("DBXrefManager", "multi-up.fas"); + try { + String refFasta = EZFileUtils.getFileContent(new File(fastaIn)); + String newFasta = EZFileUtils.getFileContent(result); + Assert.assertEquals(refFasta, newFasta); + } catch (IOException e) { + Assert.fail(); + } + } + @Test + public void testFastaConvertor4() { + String fileIn = UtilsTest.getTestFilePath("DBXrefManager", "FK669046.embl"); + File result = null; + try { + result = File.createTempFile("dbxref", ".fas"); + result.deleteOnExit(); + } catch (IOException e) { + e.printStackTrace(); + Assert.assertTrue(false);//force test to fail + } + int [] values = SeqIOUtils.convertToFasta(fileIn, result.getAbsolutePath(), + SeqIOUtils.SWISSPROT, null, null, null, DBUtils.NO_HEADER_FORMAT); + Assert.assertEquals(values[0], 1); + Assert.assertEquals(values[1], 851); + String fastaIn = UtilsTest.getTestFilePath("DBXrefManager", "FK669046.fas"); + try { + String refFasta = EZFileUtils.getFileContent(new File(fastaIn)); + String newFasta = EZFileUtils.getFileContent(result); + Assert.assertEquals(refFasta, newFasta); + } catch (IOException e) { + Assert.fail(); + } + } } diff --git a/tests/junit/DBXrefManager/A0A7C4XVR8_9EURY.dat b/tests/junit/DBXrefManager/A0A7C4XVR8_9EURY.dat new file mode 100644 index 0000000..300ae44 --- /dev/null +++ b/tests/junit/DBXrefManager/A0A7C4XVR8_9EURY.dat @@ -0,0 +1,75 @@ +ID A0A7C4XVR8_9EURY Unreviewed; 104 AA. +AC A0A7C4XVR8; +DT 02-DEC-2020, integrated into UniProtKB/TrEMBL. +DT 02-DEC-2020, sequence version 1. +DT 22-FEB-2023, entry version 9. 
+DE RecName: Full=Ribonuclease P protein component 2 {ECO:0000256|HAMAP-Rule:MF_00755}; +DE Short=RNase P component 2 {ECO:0000256|HAMAP-Rule:MF_00755}; +DE EC=3.1.26.5 {ECO:0000256|HAMAP-Rule:MF_00755}; +DE AltName: Full=Pop5 {ECO:0000256|HAMAP-Rule:MF_00755}; +GN Name=rnp2 {ECO:0000256|HAMAP-Rule:MF_00755}; +GN ORFNames=ENS21_00425 {ECO:0000313|EMBL:HGJ07577.1}, ENV11_05530 +GN {ECO:0000313|EMBL:HGW38869.1}; +OS Archaeoglobus sp. +OC Archaea; Euryarchaeota; Archaeoglobi; Archaeoglobales; Archaeoglobaceae; +OC Archaeoglobus. +OX NCBI_TaxID=1872626 {ECO:0000313|EMBL:HGW38869.1}; +RN [1] {ECO:0000313|EMBL:HGW38869.1} +RP NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA]. +RC STRAIN=SpSt-47 {ECO:0000313|EMBL:HGJ07577.1}, and SpSt-73 +RC {ECO:0000313|EMBL:HGW38869.1}; +RX PubMed=31911466; +RA Zhou Z., Liu Y., Xu W., Pan J., Luo Z.H., Li M.; +RT "Genome- and Community-Level Interaction Insights into Carbon Utilization +RT and Element Cycling Functions of Hydrothermarchaeota in Hydrothermal +RT Sediment."; +RL mSystems 5:e00795-e00719(2020). +CC -!- FUNCTION: Part of ribonuclease P, a protein complex that generates +CC mature tRNA molecules by cleaving their 5'-ends. {ECO:0000256|HAMAP- +CC Rule:MF_00755}. +CC -!- CATALYTIC ACTIVITY: +CC Reaction=Endonucleolytic cleavage of RNA, removing 5'-extranucleotides +CC from tRNA precursor.; EC=3.1.26.5; Evidence={ECO:0000256|HAMAP- +CC Rule:MF_00755}; +CC -!- SUBUNIT: Consists of a catalytic RNA component and at least 4-5 protein +CC subunits. {ECO:0000256|HAMAP-Rule:MF_00755}. +CC -!- SUBCELLULAR LOCATION: Cytoplasm {ECO:0000256|HAMAP-Rule:MF_00755}. +CC -!- SIMILARITY: Belongs to the eukaryotic/archaeal RNase P protein +CC component 2 family. {ECO:0000256|HAMAP-Rule:MF_00755}. +CC -!- CAUTION: The sequence shown here is derived from an EMBL/GenBank/DDBJ +CC whole genome shotgun (WGS) entry which is preliminary data. +CC {ECO:0000313|EMBL:HGW38869.1}. 
+CC --------------------------------------------------------------------------- +CC Copyrighted by the UniProt Consortium, see https://www.uniprot.org/terms +CC Distributed under the Creative Commons Attribution (CC BY 4.0) License +CC --------------------------------------------------------------------------- +DR EMBL; DSTZ01000011; HGJ07577.1; -; Genomic_DNA. +DR EMBL; DTFC01000041; HGW38869.1; -; Genomic_DNA. +DR AlphaFoldDB; A0A7C4XVR8; -. +DR GO; GO:0005737; C:cytoplasm; IEA:UniProtKB-SubCell. +DR GO; GO:0030677; C:ribonuclease P complex; IEA:UniProtKB-UniRule. +DR GO; GO:0004526; F:ribonuclease P activity; IEA:UniProtKB-UniRule. +DR GO; GO:0033204; F:ribonuclease P RNA binding; IEA:InterPro. +DR GO; GO:0001682; P:tRNA 5'-leader removal; IEA:UniProtKB-UniRule. +DR Gene3D; 3.30.70.3250; Ribonuclease P, Pop5 subunit; 1. +DR HAMAP; MF_00755; RNase_P_2; 1. +DR InterPro; IPR002759; Pop5/Rpp14/Rnp2-like. +DR InterPro; IPR016819; RNase_P/MRP_POP5. +DR InterPro; IPR038085; Rnp2-like_sf. +DR PANTHER; PTHR15441; RIBONUCLEASE P PROTEIN SUBUNIT P14; 1. +DR PANTHER; PTHR15441:SF2; RIBONUCLEASE P/MRP PROTEIN SUBUNIT POP5; 1. +DR Pfam; PF01900; RNase_P_Rpp14; 1. +DR PIRSF; PIRSF023803; Ribonuclease_P_prd; 1. +DR SUPFAM; SSF160350; Rnp2-like; 1. +PE 3: Inferred from homology; +KW Cytoplasm {ECO:0000256|HAMAP-Rule:MF_00755}; +KW Endonuclease {ECO:0000256|ARBA:ARBA00022759, ECO:0000256|HAMAP- +KW Rule:MF_00755}; +KW Hydrolase {ECO:0000256|ARBA:ARBA00022801, ECO:0000256|HAMAP-Rule:MF_00755}; +KW Nuclease {ECO:0000256|ARBA:ARBA00022722, ECO:0000256|HAMAP-Rule:MF_00755}; +KW tRNA processing {ECO:0000256|ARBA:ARBA00022694, ECO:0000256|HAMAP- +KW Rule:MF_00755}. 
+SQ SEQUENCE 104 AA; 11415 MW; 39F1CB7E11AC827D CRC64; + MRSRKRYIAF RIINKGSVDE KALSEAMMRN LTALFGEVSA VECGLRLEKF DGERGIVRCN + LEALDRVMIA LTLIDRIGDE SVALLTLGVS GTLKGCKKKL GVLA +// diff --git a/tests/junit/DBXrefManager/A0A7C4XVR8_9EURY.fas b/tests/junit/DBXrefManager/A0A7C4XVR8_9EURY.fas new file mode 100644 index 0000000..be0bd96 --- /dev/null +++ b/tests/junit/DBXrefManager/A0A7C4XVR8_9EURY.fas @@ -0,0 +1,3 @@ +>A0A7C4XVR8_9EURY Ribonuclease P protein component 2 [[taxon:1872626;EC:3.1.26.5;GO:0005737,0030677,0004526,0033204,0001682;Pfam:PF01900;InterPro:IPR002759,IPR016819,IPR038085]] +MRSRKRYIAFRIINKGSVDEKALSEAMMRNLTALFGEVSAVECGLRLEKFDGERGIVRCN +LEALDRVMIALTLIDRIGDESVALLTLGVSGTLKGCKKKLGVLA \ No newline at end of file diff --git a/tests/junit/DBXrefManager/B2R6X2.up b/tests/junit/DBXrefManager/B2R6X2.up index b8549a0..2f7f5d7 100755 --- a/tests/junit/DBXrefManager/B2R6X2.up +++ b/tests/junit/DBXrefManager/B2R6X2.up @@ -1,60 +1,60 @@ -ID B2R6X2_HUMAN Unreviewed; 651 AA. -AC B2R6X2; -DT 01-JUL-2008, integrated into UniProtKB/TrEMBL. -DT 01-JUL-2008, sequence version 1. -DT 10-FEB-2009, entry version 5. -DE SubName: Full=cDNA, FLJ93161, highly similar to Homo sapiens glucuronidase, beta (GUSB), mRNA; -OS Homo sapiens (Human). -OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; -OC Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; -OC Catarrhini; Hominidae; Homo. -OX NCBI_TaxID=9606; -RN [1] -RP NUCLEOTIDE SEQUENCE. -RC TISSUE=Spleen; -RA Wakamatsu A., Yamamoto J., Kimura K., Kaida T., Tsuchiya K., Iida Y., -RA Takayama Y., Murakawa K., Kanehori K., Andoh T., Kagawa N., Sato R., -RA Kawamura Y., Tanaka S., Kisu Y., Sugano S., Goshima N., Nomura N., -RA Isogai T.; -RT "NEDO functional analysis of protein and research application -RT project."; -RL Submitted (JAN-2008) to the EMBL/GenBank/DDBJ databases. 
-CC ----------------------------------------------------------------------- -CC Copyrighted by the UniProt Consortium, see http://www.uniprot.org/terms -CC Distributed under the Creative Commons Attribution-NoDerivs License -CC ----------------------------------------------------------------------- -DR EMBL; AK312752; BAG35619.1; -; mRNA. -DR UniGene; Hs.255230; -. -DR IPI; IPI00027745; -. -DR Ensembl; ENSG00000169919; Homo sapiens. -DR GO; GO:0043169; F:cation binding; IEA:InterPro. -DR GO; GO:0004553; F:hydrolase activity, hydrolyzing O-glycosyl ...; IEA:InterPro. -DR GO; GO:0005975; P:carbohydrate metabolic process; IEA:InterPro. -DR InterPro; IPR006101; Glyco_hydro_2. -DR InterPro; IPR013812; Glyco_hydro_2/20_Ig-like. -DR InterPro; IPR006104; Glyco_hydro_2_carb-bd. -DR InterPro; IPR006102; Glyco_hydro_2_Ig-like. -DR InterPro; IPR006103; Glyco_hydro_2_TIM. -DR InterPro; IPR013781; Glyco_hydro_sub_cat. -DR Gene3D; G3DSA:2.60.40.320; Glyco_hydro_2/20_Ig-like; 1. -DR Gene3D; G3DSA:3.20.20.80; Glyco_hydro_cat; 1. -DR Pfam; PF00703; Glyco_hydro_2; 1. -DR Pfam; PF02836; Glyco_hydro_2_C; 1. -DR Pfam; PF02837; Glyco_hydro_2_N; 1. -DR PRINTS; PR00132; GLHYDRLASE2. -DR PROSITE; PS00719; GLYCOSYL_HYDROL_F2_1; 1. -DR PROSITE; PS00608; GLYCOSYL_HYDROL_F2_2; 1. 
-PE 2: Evidence at transcript level; -SQ SEQUENCE 651 AA; 74706 MW; 2759150B7EA5A78C CRC64; - MARGSAVAWA ALGPLLWGCA LGLQGGMLYP QESPSRECKE LDGLWSFRAD FSDNRRRGFE - EQWYRRPLWE SGPTVDMPVP SSFNDISQDW RLRHFVGWVW YEREVILPER WTQDLRTRVV - LRIGSAHSYA IVWVNGVDTL EHEGGYLPFE ADISNLVQVG PLPSRLRITI AINNTLTPTT - LPPGTIQYLT DTSKYPKGYF VQNTYFDFFN YAGLQRSVLL YTTPTTYIDD ITVTTSVEQD - SGLVNYQISV KGSNLFKLEV RLLDAENKVV ANGTGTQGQL KVPGVSLWWP YLMHERPAYL - YSLEVQLTAQ TSLGPVSDFY TLPVGIRTVA VTKSQFLING KPFYFHGVNK HEDADIRGKG - FDWPLLVKDF NLLRWLGANA FRTSHYPYAE EVMQMCDRYG IVVIDECPGV GLALPQFFNN - VSLHHHMQVM EEVVRRDKNH PAVVMWSVAN EPASHLESAG YYLKMVIAHT KSLDPSRSVT - FVSNSNYAAD KGAPYVDVIC LNSYYSWYHD YGHLELIQLQ LATQFENWYK KYQKPIIQSE - YGAETIAGFH QDPPLMFTEE YQKSLLEQYH LGLDQKRRKY VVGELIWNFA DFMTEQSPTR - VLGNKKGIFT RQRQPKSAAF LLRERYWKIA NETRYPHSVA KSQCLENSPF T -// +ID B2R6X2_HUMAN Unreviewed; 651 AA. +AC B2R6X2; +DT 01-JUL-2008, integrated into UniProtKB/TrEMBL. +DT 01-JUL-2008, sequence version 1. +DT 10-FEB-2009, entry version 5. +DE SubName: Full=cDNA, FLJ93161, highly similar to Homo sapiens glucuronidase, beta (GUSB), mRNA; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; +OC Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; +OC Catarrhini; Hominidae; Homo. +OX NCBI_TaxID=9606; +RN [1] +RP NUCLEOTIDE SEQUENCE. +RC TISSUE=Spleen; +RA Wakamatsu A., Yamamoto J., Kimura K., Kaida T., Tsuchiya K., Iida Y., +RA Takayama Y., Murakawa K., Kanehori K., Andoh T., Kagawa N., Sato R., +RA Kawamura Y., Tanaka S., Kisu Y., Sugano S., Goshima N., Nomura N., +RA Isogai T.; +RT "NEDO functional analysis of protein and research application +RT project."; +RL Submitted (JAN-2008) to the EMBL/GenBank/DDBJ databases. 
+CC ----------------------------------------------------------------------- +CC Copyrighted by the UniProt Consortium, see http://www.uniprot.org/terms +CC Distributed under the Creative Commons Attribution-NoDerivs License +CC ----------------------------------------------------------------------- +DR EMBL; AK312752; BAG35619.1; -; mRNA. +DR UniGene; Hs.255230; -. +DR IPI; IPI00027745; -. +DR Ensembl; ENSG00000169919; Homo sapiens. +DR GO; GO:0043169; F:cation binding; IEA:InterPro. +DR GO; GO:0004553; F:hydrolase activity, hydrolyzing O-glycosyl ...; IEA:InterPro. +DR GO; GO:0005975; P:carbohydrate metabolic process; IEA:InterPro. +DR InterPro; IPR006101; Glyco_hydro_2. +DR InterPro; IPR013812; Glyco_hydro_2/20_Ig-like. +DR InterPro; IPR006104; Glyco_hydro_2_carb-bd. +DR InterPro; IPR006102; Glyco_hydro_2_Ig-like. +DR InterPro; IPR006103; Glyco_hydro_2_TIM. +DR InterPro; IPR013781; Glyco_hydro_sub_cat. +DR Gene3D; G3DSA:2.60.40.320; Glyco_hydro_2/20_Ig-like; 1. +DR Gene3D; G3DSA:3.20.20.80; Glyco_hydro_cat; 1. +DR Pfam; PF00703; Glyco_hydro_2; 1. +DR Pfam; PF02836; Glyco_hydro_2_C; 1. +DR Pfam; PF02837; Glyco_hydro_2_N; 1. +DR PRINTS; PR00132; GLHYDRLASE2. +DR PROSITE; PS00719; GLYCOSYL_HYDROL_F2_1; 1. +DR PROSITE; PS00608; GLYCOSYL_HYDROL_F2_2; 1. 
+PE 2: Evidence at transcript level; +SQ SEQUENCE 651 AA; 74706 MW; 2759150B7EA5A78C CRC64; + MARGSAVAWA ALGPLLWGCA LGLQGGMLYP QESPSRECKE LDGLWSFRAD FSDNRRRGFE + EQWYRRPLWE SGPTVDMPVP SSFNDISQDW RLRHFVGWVW YEREVILPER WTQDLRTRVV + LRIGSAHSYA IVWVNGVDTL EHEGGYLPFE ADISNLVQVG PLPSRLRITI AINNTLTPTT + LPPGTIQYLT DTSKYPKGYF VQNTYFDFFN YAGLQRSVLL YTTPTTYIDD ITVTTSVEQD + SGLVNYQISV KGSNLFKLEV RLLDAENKVV ANGTGTQGQL KVPGVSLWWP YLMHERPAYL + YSLEVQLTAQ TSLGPVSDFY TLPVGIRTVA VTKSQFLING KPFYFHGVNK HEDADIRGKG + FDWPLLVKDF NLLRWLGANA FRTSHYPYAE EVMQMCDRYG IVVIDECPGV GLALPQFFNN + VSLHHHMQVM EEVVRRDKNH PAVVMWSVAN EPASHLESAG YYLKMVIAHT KSLDPSRSVT + FVSNSNYAAD KGAPYVDVIC LNSYYSWYHD YGHLELIQLQ LATQFENWYK KYQKPIIQSE + YGAETIAGFH QDPPLMFTEE YQKSLLEQYH LGLDQKRRKY VVGELIWNFA DFMTEQSPTR + VLGNKKGIFT RQRQPKSAAF LLRERYWKIA NETRYPHSVA KSQCLENSPF T +// diff --git a/tests/junit/DBXrefManager/FK669046.fas b/tests/junit/DBXrefManager/FK669046.fas new file mode 100644 index 0000000..2f8c443 --- /dev/null +++ b/tests/junit/DBXrefManager/FK669046.fas @@ -0,0 +1,16 @@ +>FK669046 634 cowpea buchid larval midgut cDNA library Callosobruchus maculatus cDNA clone GBA similar to Glucosylceramidase, mRNA sequence. 
[[taxon:64391]] +gagtttgcaaaatgcagtgcctgaagtatgttgtactatttacagttttcagcaagacac +ttgcagaaggatgcctgagtagagactatggaaacggtggtacagtctgcgtttgtaacg +cagaccactgcgataccatagaaccagtgacttcagttgaaaaatcatcttatgtgatct +acacgaccaataaggcagggctcagattgaacaagaaaaccgacaagtttgctactgcaa +aagatgagtacgagaatcaaataactgttggtgaaaaagtgtatcaagaaatacttggtt +ttggtggtgccttcactgactctactggcatcaacattttgtctttgaacgagtctgttc +aagagaagcttctaagatcctatttttccgataatggaatagagtacaacttatgtcgag +tacctataggcggtactgatttttcgactcgcgggtacagctatcatgatgacgttgaag +atgcgagtttgtccaactttaagttgcaagaggaagatcacaaatacaagattccactca +tcaagcgagctgcagcttaccagaatggtctgcagcttttcggttcagcttggtcggcac +caaaatggatgaaagtgcacgatttacctgcggtccattcgggtatttggaagaaaaagt +actatcaagcctgggctgactatcacgtcaaattcttggatgcctatggcaaggaaaata +tcaccttcctggggtatgactacttgaaatgagccattcactgggcttctccggtgccgg +taccttgctgttggatggaactgctcaaagaaacagcgcgttatggggattttgcgcgaa +aaaaaacactc diff --git a/tests/junit/DBXrefManager/multi-up.dat b/tests/junit/DBXrefManager/multi-up.dat new file mode 100644 index 0000000..2b0abf5 --- /dev/null +++ b/tests/junit/DBXrefManager/multi-up.dat @@ -0,0 +1,295 @@ +ID A0A7C4XVR8_9EURY Unreviewed; 104 AA. +AC A0A7C4XVR8; +DT 02-DEC-2020, integrated into UniProtKB/TrEMBL. +DT 02-DEC-2020, sequence version 1. +DT 22-FEB-2023, entry version 9. +DE RecName: Full=Ribonuclease P protein component 2 {ECO:0000256|HAMAP-Rule:MF_00755}; +DE Short=RNase P component 2 {ECO:0000256|HAMAP-Rule:MF_00755}; +DE EC=3.1.26.5 {ECO:0000256|HAMAP-Rule:MF_00755}; +DE AltName: Full=Pop5 {ECO:0000256|HAMAP-Rule:MF_00755}; +GN Name=rnp2 {ECO:0000256|HAMAP-Rule:MF_00755}; +GN ORFNames=ENS21_00425 {ECO:0000313|EMBL:HGJ07577.1}, ENV11_05530 +GN {ECO:0000313|EMBL:HGW38869.1}; +OS Archaeoglobus sp. +OC Archaea; Euryarchaeota; Archaeoglobi; Archaeoglobales; Archaeoglobaceae; +OC Archaeoglobus. +OX NCBI_TaxID=1872626 {ECO:0000313|EMBL:HGW38869.1}; +RN [1] {ECO:0000313|EMBL:HGW38869.1} +RP NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA]. 
+RC STRAIN=SpSt-47 {ECO:0000313|EMBL:HGJ07577.1}, and SpSt-73 +RC {ECO:0000313|EMBL:HGW38869.1}; +RX PubMed=31911466; +RA Zhou Z., Liu Y., Xu W., Pan J., Luo Z.H., Li M.; +RT "Genome- and Community-Level Interaction Insights into Carbon Utilization +RT and Element Cycling Functions of Hydrothermarchaeota in Hydrothermal +RT Sediment."; +RL mSystems 5:e00795-e00719(2020). +CC -!- FUNCTION: Part of ribonuclease P, a protein complex that generates +CC mature tRNA molecules by cleaving their 5'-ends. {ECO:0000256|HAMAP- +CC Rule:MF_00755}. +CC -!- CATALYTIC ACTIVITY: +CC Reaction=Endonucleolytic cleavage of RNA, removing 5'-extranucleotides +CC from tRNA precursor.; EC=3.1.26.5; Evidence={ECO:0000256|HAMAP- +CC Rule:MF_00755}; +CC -!- SUBUNIT: Consists of a catalytic RNA component and at least 4-5 protein +CC subunits. {ECO:0000256|HAMAP-Rule:MF_00755}. +CC -!- SUBCELLULAR LOCATION: Cytoplasm {ECO:0000256|HAMAP-Rule:MF_00755}. +CC -!- SIMILARITY: Belongs to the eukaryotic/archaeal RNase P protein +CC component 2 family. {ECO:0000256|HAMAP-Rule:MF_00755}. +CC -!- CAUTION: The sequence shown here is derived from an EMBL/GenBank/DDBJ +CC whole genome shotgun (WGS) entry which is preliminary data. +CC {ECO:0000313|EMBL:HGW38869.1}. +CC --------------------------------------------------------------------------- +CC Copyrighted by the UniProt Consortium, see https://www.uniprot.org/terms +CC Distributed under the Creative Commons Attribution (CC BY 4.0) License +CC --------------------------------------------------------------------------- +DR EMBL; DSTZ01000011; HGJ07577.1; -; Genomic_DNA. +DR EMBL; DTFC01000041; HGW38869.1; -; Genomic_DNA. +DR AlphaFoldDB; A0A7C4XVR8; -. +DR GO; GO:0005737; C:cytoplasm; IEA:UniProtKB-SubCell. +DR GO; GO:0030677; C:ribonuclease P complex; IEA:UniProtKB-UniRule. +DR GO; GO:0004526; F:ribonuclease P activity; IEA:UniProtKB-UniRule. +DR GO; GO:0033204; F:ribonuclease P RNA binding; IEA:InterPro. 
+DR GO; GO:0001682; P:tRNA 5'-leader removal; IEA:UniProtKB-UniRule. +DR Gene3D; 3.30.70.3250; Ribonuclease P, Pop5 subunit; 1. +DR HAMAP; MF_00755; RNase_P_2; 1. +DR InterPro; IPR002759; Pop5/Rpp14/Rnp2-like. +DR InterPro; IPR016819; RNase_P/MRP_POP5. +DR InterPro; IPR038085; Rnp2-like_sf. +DR PANTHER; PTHR15441; RIBONUCLEASE P PROTEIN SUBUNIT P14; 1. +DR PANTHER; PTHR15441:SF2; RIBONUCLEASE P/MRP PROTEIN SUBUNIT POP5; 1. +DR Pfam; PF01900; RNase_P_Rpp14; 1. +DR PIRSF; PIRSF023803; Ribonuclease_P_prd; 1. +DR SUPFAM; SSF160350; Rnp2-like; 1. +PE 3: Inferred from homology; +KW Cytoplasm {ECO:0000256|HAMAP-Rule:MF_00755}; +KW Endonuclease {ECO:0000256|ARBA:ARBA00022759, ECO:0000256|HAMAP- +KW Rule:MF_00755}; +KW Hydrolase {ECO:0000256|ARBA:ARBA00022801, ECO:0000256|HAMAP-Rule:MF_00755}; +KW Nuclease {ECO:0000256|ARBA:ARBA00022722, ECO:0000256|HAMAP-Rule:MF_00755}; +KW tRNA processing {ECO:0000256|ARBA:ARBA00022694, ECO:0000256|HAMAP- +KW Rule:MF_00755}. +SQ SEQUENCE 104 AA; 11415 MW; 39F1CB7E11AC827D CRC64; + MRSRKRYIAF RIINKGSVDE KALSEAMMRN LTALFGEVSA VECGLRLEKF DGERGIVRCN + LEALDRVMIA LTLIDRIGDE SVALLTLGVS GTLKGCKKKL GVLA +// +ID B2R6X2_HUMAN Unreviewed; 651 AA. +AC B2R6X2; +DT 01-JUL-2008, integrated into UniProtKB/TrEMBL. +DT 01-JUL-2008, sequence version 1. +DT 10-FEB-2009, entry version 5. +DE SubName: Full=cDNA, FLJ93161, highly similar to Homo sapiens glucuronidase, beta (GUSB), mRNA; +OS Homo sapiens (Human). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; +OC Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; +OC Catarrhini; Hominidae; Homo. +OX NCBI_TaxID=9606; +RN [1] +RP NUCLEOTIDE SEQUENCE. 
+RC TISSUE=Spleen; +RA Wakamatsu A., Yamamoto J., Kimura K., Kaida T., Tsuchiya K., Iida Y., +RA Takayama Y., Murakawa K., Kanehori K., Andoh T., Kagawa N., Sato R., +RA Kawamura Y., Tanaka S., Kisu Y., Sugano S., Goshima N., Nomura N., +RA Isogai T.; +RT "NEDO functional analysis of protein and research application +RT project."; +RL Submitted (JAN-2008) to the EMBL/GenBank/DDBJ databases. +CC ----------------------------------------------------------------------- +CC Copyrighted by the UniProt Consortium, see http://www.uniprot.org/terms +CC Distributed under the Creative Commons Attribution-NoDerivs License +CC ----------------------------------------------------------------------- +DR EMBL; AK312752; BAG35619.1; -; mRNA. +DR UniGene; Hs.255230; -. +DR IPI; IPI00027745; -. +DR Ensembl; ENSG00000169919; Homo sapiens. +DR GO; GO:0043169; F:cation binding; IEA:InterPro. +DR GO; GO:0004553; F:hydrolase activity, hydrolyzing O-glycosyl ...; IEA:InterPro. +DR GO; GO:0005975; P:carbohydrate metabolic process; IEA:InterPro. +DR InterPro; IPR006101; Glyco_hydro_2. +DR InterPro; IPR013812; Glyco_hydro_2/20_Ig-like. +DR InterPro; IPR006104; Glyco_hydro_2_carb-bd. +DR InterPro; IPR006102; Glyco_hydro_2_Ig-like. +DR InterPro; IPR006103; Glyco_hydro_2_TIM. +DR InterPro; IPR013781; Glyco_hydro_sub_cat. +DR Gene3D; G3DSA:2.60.40.320; Glyco_hydro_2/20_Ig-like; 1. +DR Gene3D; G3DSA:3.20.20.80; Glyco_hydro_cat; 1. +DR Pfam; PF00703; Glyco_hydro_2; 1. +DR Pfam; PF02836; Glyco_hydro_2_C; 1. +DR Pfam; PF02837; Glyco_hydro_2_N; 1. +DR PRINTS; PR00132; GLHYDRLASE2. +DR PROSITE; PS00719; GLYCOSYL_HYDROL_F2_1; 1. +DR PROSITE; PS00608; GLYCOSYL_HYDROL_F2_2; 1. 
+PE 2: Evidence at transcript level; +SQ SEQUENCE 651 AA; 74706 MW; 2759150B7EA5A78C CRC64; + MARGSAVAWA ALGPLLWGCA LGLQGGMLYP QESPSRECKE LDGLWSFRAD FSDNRRRGFE + EQWYRRPLWE SGPTVDMPVP SSFNDISQDW RLRHFVGWVW YEREVILPER WTQDLRTRVV + LRIGSAHSYA IVWVNGVDTL EHEGGYLPFE ADISNLVQVG PLPSRLRITI AINNTLTPTT + LPPGTIQYLT DTSKYPKGYF VQNTYFDFFN YAGLQRSVLL YTTPTTYIDD ITVTTSVEQD + SGLVNYQISV KGSNLFKLEV RLLDAENKVV ANGTGTQGQL KVPGVSLWWP YLMHERPAYL + YSLEVQLTAQ TSLGPVSDFY TLPVGIRTVA VTKSQFLING KPFYFHGVNK HEDADIRGKG + FDWPLLVKDF NLLRWLGANA FRTSHYPYAE EVMQMCDRYG IVVIDECPGV GLALPQFFNN + VSLHHHMQVM EEVVRRDKNH PAVVMWSVAN EPASHLESAG YYLKMVIAHT KSLDPSRSVT + FVSNSNYAAD KGAPYVDVIC LNSYYSWYHD YGHLELIQLQ LATQFENWYK KYQKPIIQSE + YGAETIAGFH QDPPLMFTEE YQKSLLEQYH LGLDQKRRKY VVGELIWNFA DFMTEQSPTR + VLGNKKGIFT RQRQPKSAAF LLRERYWKIA NETRYPHSVA KSQCLENSPF T +// +ID BGLR_MOUSE Reviewed; 648 AA. +AC P12265; Q61601; Q64473; Q64474; +DT 01-OCT-1989, integrated into UniProtKB/Swiss-Prot. +DT 01-OCT-1989, sequence version 1. +DT 19-JAN-2010, entry version 101. +DE RecName: Full=Beta-glucuronidase; +DE EC=3.2.1.31; +DE Flags: Precursor; +GN Name=Gusb; Synonyms=Gus, Gus-s; +OS Mus musculus (Mouse). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; +OC Mammalia; Eutheria; Euarchontoglires; Glires; Rodentia; Sciurognathi; +OC Muroidea; Muridae; Murinae; Mus. +OX NCBI_TaxID=10090; +RN [1] +RP NUCLEOTIDE SEQUENCE. +RX MEDLINE=88085188; PubMed=2891607; DOI=10.1016/0888-7543(87)90006-1; +RA Gallagher P.M., D'Amore M.A., Lund S.D., Elliott R.W., Pazik J., +RA Hohman C., Korfhagen T.R., Ganschow R.E.; +RT "DNA sequence variation within the beta-glucuronidase gene complex +RT among inbred strains of mice."; +RL Genomics 1:145-152(1987). +RN [2] +RP NUCLEOTIDE SEQUENCE [MRNA]. 
+RX MEDLINE=88284700; PubMed=3397060; DOI=10.1016/0888-7543(88)90005-5; +RA Gallagher P.M., D'Amore M.A., Lund S.D., Ganschow R.E.; +RT "The complete nucleotide sequence of murine beta-glucuronidase mRNA +RT and its deduced polypeptide."; +RL Genomics 2:215-219(1988). +RN [3] +RP NUCLEOTIDE SEQUENCE [GENOMIC DNA]. +RX MEDLINE=89062453; PubMed=3196706; DOI=10.1021/bi00418a070; +RA D'Amore M.A., Gallagher P.M., Korfhagen T.R., Ganschow R.E.; +RT "Complete sequence and organization of the murine beta-glucuronidase +RT gene."; +RL Biochemistry 27:7131-7140(1988). +RN [4] +RP NUCLEOTIDE SEQUENCE [MRNA]. +RC STRAIN=C3H/HeJ, and YBR; TISSUE=Sperm; +RX MEDLINE=89384641; PubMed=2779578; +RA Wawrzyniak C.J., Gallagher P.M., D'Amore M.A., Carter J.E., Lund S.D., +RA Rinchik E.M., Ganschow R.E.; +RT "DNA determinants of structural and regulatory variation within the +RT murine beta-glucuronidase gene complex."; +RL Mol. Cell. Biol. 9:4074-4078(1989). +RN [5] +RP NUCLEOTIDE SEQUENCE [MRNA]. +RX MEDLINE=88216590; PubMed=2835664; +RA Funkenstein B., Leary S.L., Stein J.C., Catterall J.F.; +RT "Genomic organization and sequence of the Gus-s alpha allele of the +RT murine beta-glucuronidase gene."; +RL Mol. Cell. Biol. 8:1160-1168(1988). +RN [6] +RP NUCLEOTIDE SEQUENCE OF 593-648, VARIANT ARG-642, AND SUBCELLULAR +RP LOCATION. +RX MEDLINE=90368633; PubMed=2394691; +RA Li H., Takeuchi K.H., Manly K., Chapman V., Swank R.T.; +RT "The propeptide of beta-glucuronidase. Further evidence of its +RT involvement in compartmentalization of beta-glucuronidase and sequence +RT similarity with portions of the reactive site region of the serpin +RT superfamily."; +RL J. Biol. Chem. 265:14732-14735(1990). +CC -!- FUNCTION: Plays an important role in the degradation of dermatan +CC and keratan sulfates. +CC -!- CATALYTIC ACTIVITY: A beta-D-glucuronoside + H(2)O = D-glucuronate +CC + an alcohol. +CC -!- ENZYME REGULATION: Inhibited by L-aspartic acid (By similarity). +CC -!- SUBUNIT: Homotetramer. 
+CC -!- SUBCELLULAR LOCATION: Lysosome. Endoplasmic reticulum. Note=A +CC small proportion is found in the endoplasmic reticulum. +CC -!- SIMILARITY: Belongs to the glycosyl hydrolase 2 family. +CC ----------------------------------------------------------------------- +CC Copyrighted by the UniProt Consortium, see http://www.uniprot.org/terms +CC Distributed under the Creative Commons Attribution-NoDerivs License +CC ----------------------------------------------------------------------- +DR EMBL; J03047; AAA37696.1; -; mRNA. +DR EMBL; J02836; AAA98623.1; -; Genomic_DNA. +DR EMBL; M63836; AAA63309.1; -; mRNA. +DR EMBL; M28540; AAA63307.1; -; mRNA. +DR EMBL; M28541; AAA63308.1; -; mRNA. +DR EMBL; M19279; AAA37697.1; -; mRNA. +DR IPI; IPI00309230; -. +DR PIR; A32576; A32576. +DR RefSeq; NP_034498.1; -. +DR UniGene; Mm.3317; -. +DR SMR; P12265; 22-628. +DR STRING; P12265; -. +DR CAZy; GH2; Glycoside Hydrolase Family 2. +DR PRIDE; P12265; -. +DR Ensembl; ENSMUST00000026613; ENSMUSP00000026613; ENSMUSG00000025534; Mus musculus. +DR GeneID; 110006; -. +DR KEGG; mmu:110006; -. +DR CTD; 110006; -. +DR MGI; MGI:95872; Gusb. +DR eggNOG; roNOG15323; -. +DR HOGENOM; HBG474923; -. +DR HOVERGEN; P12265; -. +DR InParanoid; P12265; -. +DR PhylomeDB; P12265; -. +DR BRENDA; 3.2.1.31; 244. +DR NextBio; 363145; -. +DR ArrayExpress; P12265; -. +DR Bgee; P12265; -. +DR CleanEx; MM_GUSB; -. +DR Genevestigator; P12265; -. +DR GermOnline; ENSMUSG00000025534; Mus musculus. +DR GO; GO:0005783; C:endoplasmic reticulum; IEA:UniProtKB-SubCell. +DR GO; GO:0005764; C:lysosome; IDA:MGI. +DR GO; GO:0005792; C:microsome; IDA:MGI. +DR GO; GO:0004566; F:beta-glucuronidase activity; IDA:MGI. +DR GO; GO:0043169; F:cation binding; IEA:InterPro. +DR GO; GO:0005975; P:carbohydrate metabolic process; IDA:MGI. +DR InterPro; IPR008979; Galactose-bd-like. +DR InterPro; IPR006101; Glyco_hydro_2. +DR InterPro; IPR013812; Glyco_hydro_2/20_Ig-like. +DR InterPro; IPR006104; Glyco_hydro_2_carb-bd. 
+DR InterPro; IPR006102; Glyco_hydro_2_Ig-like. +DR InterPro; IPR006103; Glyco_hydro_2_TIM. +DR InterPro; IPR017853; Glyco_hydro_catalytic_core. +DR InterPro; IPR013781; Glyco_hydro_sg_catalytic. +DR Gene3D; G3DSA:2.60.40.320; Glyco_hydro_2/20_Ig-like; 1. +DR Gene3D; G3DSA:3.20.20.80; Glyco_hydro_cat; 1. +DR Pfam; PF00703; Glyco_hydro_2; 1. +DR Pfam; PF02836; Glyco_hydro_2_C; 1. +DR Pfam; PF02837; Glyco_hydro_2_N; 1. +DR PRINTS; PR00132; GLHYDRLASE2. +DR PROSITE; PS00719; GLYCOSYL_HYDROL_F2_1; 1. +DR PROSITE; PS00608; GLYCOSYL_HYDROL_F2_2; 1. +PE 2: Evidence at transcript level; +KW Endoplasmic reticulum; Glycoprotein; Glycosidase; Hydrolase; Lysosome; +KW Polymorphism; Signal. +FT SIGNAL 1 22 +FT CHAIN 23 648 Beta-glucuronidase. +FT /FTId=PRO_0000012162. +FT ACT_SITE 447 447 Proton donor (By similarity). +FT CARBOHYD 172 172 N-linked (GlcNAc...) (Potential). +FT CARBOHYD 416 416 N-linked (GlcNAc...) (Potential). +FT CARBOHYD 591 591 N-linked (GlcNAc...) (Potential). +FT CARBOHYD 627 627 N-linked (GlcNAc...) (Potential). +FT VARIANT 87 87 T -> I (in strain: C3H/HeJ). +FT VARIANT 233 233 I -> T (in allele GUS-SA). +FT VARIANT 265 265 D -> G (in strain: YBR and C3H/HeJ). +FT VARIANT 320 320 V -> I (in strain: YBR and C3H/HeJ). +FT VARIANT 428 428 E -> K (in allele GUS-SA). +FT VARIANT 616 616 F -> L (in allele GUS-SA). +FT VARIANT 642 642 G -> R (in allele W26; reduced retention +FT in the endoplasmic reticulum). 
+SQ SEQUENCE 648 AA; 74239 MW; 3D8C65A5DB3B96D6 CRC64; + MSLKWSACWV ALGQLLCSCA LALKGGMLFP KESPSRELKA LDGLWHFRAD LSNNRLQGFE + QQWYRQPLRE SGPVLDMPVP SSFNDITQEA ALRDFIGWVW YEREAILPRR WTQDTDMRVV + LRINSAHYYA VVWVNGIHVV EHEGGHLPFE ADISKLVQSG PLTTCRITIA INNTLTPHTL + PPGTIVYKTD TSMYPKGYFV QDTSFDFFNY AGLHRSVVLY TTPTTYIDDI TVITNVEQDI + GLVTYWISVQ GSEHFQLEVQ LLDEDGKVVA HGTGNQGQLQ VPSANLWWPY LMHEHPAYMY + SLEVKVTTTE SVTDYYTLPV GIRTVAVTKS KFLINGKPFY FQGVNKHEDS DIRGKGFDWP + LLVKDFNLLR WLGANSFRTS HYPYSEEVLQ LCDRYGIVVI DECPGVGIVL PQSFGNESLR + HHLEVMEELV RRDKNHPAVV MWSVANEPSS ALKPAAYYFK TLITHTKALD LTRPVTFVSN + AKYDADLGAP YVDVICVNSY FSWYHDYGHL EVIQPQLNSQ FENWYKTHQK PIIQSEYGAD + AIPGIHEDPP RMFSEEYQKA VLENYHSVLD QKRKEYVVGE LIWNFADFMT NQSPLRVIGN + KKGIFTRQRQ PKTSAFILRE RYWRIANETG GHGSGPRTQC FGSRPFTF +// diff --git a/tests/junit/DBXrefManager/multi-up.fas b/tests/junit/DBXrefManager/multi-up.fas new file mode 100644 index 0000000..332b560 --- /dev/null +++ b/tests/junit/DBXrefManager/multi-up.fas @@ -0,0 +1,27 @@ +>A0A7C4XVR8_9EURY Ribonuclease P protein component 2 [[taxon:1872626;EC:3.1.26.5;GO:0005737,0030677,0004526,0033204,0001682;Pfam:PF01900;InterPro:IPR002759,IPR016819,IPR038085]] +MRSRKRYIAFRIINKGSVDEKALSEAMMRNLTALFGEVSAVECGLRLEKFDGERGIVRCN +LEALDRVMIALTLIDRIGDESVALLTLGVSGTLKGCKKKLGVLA +>B2R6X2_HUMAN cDNA, FLJ93161, highly similar to Homo sapiens glucuronidase, beta (GUSB), mRNA [[taxon:9606;GO:0043169,0004553,0005975;Pfam:PF00703,PF02836,PF02837;InterPro:IPR006101,IPR013812,IPR006104,IPR006102,IPR006103,IPR013781]] +MARGSAVAWAALGPLLWGCALGLQGGMLYPQESPSRECKELDGLWSFRADFSDNRRRGFE +EQWYRRPLWESGPTVDMPVPSSFNDISQDWRLRHFVGWVWYEREVILPERWTQDLRTRVV +LRIGSAHSYAIVWVNGVDTLEHEGGYLPFEADISNLVQVGPLPSRLRITIAINNTLTPTT +LPPGTIQYLTDTSKYPKGYFVQNTYFDFFNYAGLQRSVLLYTTPTTYIDDITVTTSVEQD +SGLVNYQISVKGSNLFKLEVRLLDAENKVVANGTGTQGQLKVPGVSLWWPYLMHERPAYL +YSLEVQLTAQTSLGPVSDFYTLPVGIRTVAVTKSQFLINGKPFYFHGVNKHEDADIRGKG +FDWPLLVKDFNLLRWLGANAFRTSHYPYAEEVMQMCDRYGIVVIDECPGVGLALPQFFNN 
+VSLHHHMQVMEEVVRRDKNHPAVVMWSVANEPASHLESAGYYLKMVIAHTKSLDPSRSVT +FVSNSNYAADKGAPYVDVICLNSYYSWYHDYGHLELIQLQLATQFENWYKKYQKPIIQSE +YGAETIAGFHQDPPLMFTEEYQKSLLEQYHLGLDQKRRKYVVGELIWNFADFMTEQSPTR +VLGNKKGIFTRQRQPKSAAFLLRERYWKIANETRYPHSVAKSQCLENSPFT +>BGLR_MOUSE Beta-glucuronidase [[taxon:10090;EC:3.2.1.31;GO:0005783,0005764,0005792,0004566,0043169,0005975;Pfam:PF00703,PF02836,PF02837;InterPro:IPR008979,IPR006101,IPR013812,IPR006104,IPR006102,IPR006103,IPR017853,IPR013781]] +MSLKWSACWVALGQLLCSCALALKGGMLFPKESPSRELKALDGLWHFRADLSNNRLQGFE +QQWYRQPLRESGPVLDMPVPSSFNDITQEAALRDFIGWVWYEREAILPRRWTQDTDMRVV +LRINSAHYYAVVWVNGIHVVEHEGGHLPFEADISKLVQSGPLTTCRITIAINNTLTPHTL +PPGTIVYKTDTSMYPKGYFVQDTSFDFFNYAGLHRSVVLYTTPTTYIDDITVITNVEQDI +GLVTYWISVQGSEHFQLEVQLLDEDGKVVAHGTGNQGQLQVPSANLWWPYLMHEHPAYMY +SLEVKVTTTESVTDYYTLPVGIRTVAVTKSKFLINGKPFYFQGVNKHEDSDIRGKGFDWP +LLVKDFNLLRWLGANSFRTSHYPYSEEVLQLCDRYGIVVIDECPGVGIVLPQSFGNESLR +HHLEVMEELVRRDKNHPAVVMWSVANEPSSALKPAAYYFKTLITHTKALDLTRPVTFVSN +AKYDADLGAPYVDVICVNSYFSWYHDYGHLEVIQPQLNSQFENWYKTHQKPIIQSEYGAD +AIPGIHEDPPRMFSEEYQKAVLENYHSVLDQKRKEYVVGELIWNFADFMTNQSPLRVIGN +KKGIFTRQRQPKTSAFILRERYWRIANETGGHGSGPRTQCFGSRPFTF diff --git a/tests/junit/DBXrefManager/p12265.fas b/tests/junit/DBXrefManager/p12265.fas new file mode 100644 index 0000000..ea9a64c --- /dev/null +++ b/tests/junit/DBXrefManager/p12265.fas @@ -0,0 +1,12 @@ +>BGLR_MOUSE Beta-glucuronidase [[taxon:10090;EC:3.2.1.31;GO:0005783,0005764,0005792,0004566,0043169,0005975;Pfam:PF00703,PF02836,PF02837;InterPro:IPR008979,IPR006101,IPR013812,IPR006104,IPR006102,IPR006103,IPR017853,IPR013781]] +MSLKWSACWVALGQLLCSCALALKGGMLFPKESPSRELKALDGLWHFRADLSNNRLQGFE +QQWYRQPLRESGPVLDMPVPSSFNDITQEAALRDFIGWVWYEREAILPRRWTQDTDMRVV +LRINSAHYYAVVWVNGIHVVEHEGGHLPFEADISKLVQSGPLTTCRITIAINNTLTPHTL +PPGTIVYKTDTSMYPKGYFVQDTSFDFFNYAGLHRSVVLYTTPTTYIDDITVITNVEQDI +GLVTYWISVQGSEHFQLEVQLLDEDGKVVAHGTGNQGQLQVPSANLWWPYLMHEHPAYMY +SLEVKVTTTESVTDYYTLPVGIRTVAVTKSKFLINGKPFYFQGVNKHEDSDIRGKGFDWP 
+LLVKDFNLLRWLGANSFRTSHYPYSEEVLQLCDRYGIVVIDECPGVGIVLPQSFGNESLR +HHLEVMEELVRRDKNHPAVVMWSVANEPSSALKPAAYYFKTLITHTKALDLTRPVTFVSN +AKYDADLGAPYVDVICVNSYFSWYHDYGHLEVIQPQLNSQFENWYKTHQKPIIQSEYGAD +AIPGIHEDPPRMFSEEYQKAVLENYHSVLDQKRKEYVVGELIWNFADFMTNQSPLRVIGN +KKGIFTRQRQPKTSAFILRERYWRIANETGGHGSGPRTQCFGSRPFTF \ No newline at end of file