# dipper-etl.py

#!/usr/bin/env python3

import argparse
import logging
import unittest
import importlib
import time
from tests.test_general import GeneralGraphTestCase
# from dipper.utils.TestUtils import TestUtils
from dipper.utils.GraphUtils import GraphUtils

# Configure the root logger with the library defaults; main() adjusts the
# root level later according to the --quiet / --debug options.
logging.basicConfig()
# Module-level logger used for all messages emitted by this script.
LOG = logging.getLogger(__name__)

# General graph test suite, loaded once at import time. main() runs it before
# processing any source unless --no_verify or --skip_tests is given.
TEST_SUITE = unittest.TestLoader().loadTestsFromTestCase(GeneralGraphTestCase)


def _build_arg_parser():
    """Create the argparse parser describing the dipper-etl command line."""
    parser = argparse.ArgumentParser(
        description='Dipper: Data Ingestion Pipeline for Monarch',
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        '-g', '--graph', type=str, default="rdf_graph",
        help='graph type: rdf_graph, streamed_graph')
    parser.add_argument(
        '-s', '--sources', type=str, default='?',
        help='comma separated list of sources')
    parser.add_argument('-l', '--limit', type=int, help='limit number of rows used')
    parser.add_argument(
        '--parse_only', action='store_true', help='parse files without writing RDF')
    parser.add_argument(
        '--fetch_only', action='store_true', help='fetch sources without parsing')
    parser.add_argument(
        '-f', '--force', action='store_true', help='force re-download of files')
    parser.add_argument(
        '--no_verify', help='ignore the verification step', action='store_true')
    # parser.add_argument( '--query', help='enter in a sparql query', type=str)
    parser.add_argument(
        '-q', '--quiet', help='turn off info logging', action="store_true")
    parser.add_argument(
        '--debug', help='turn on debug logging', action="store_true")
    parser.add_argument(
        '--skip_tests', help='skip any testing', action="store_true")
    # Blank Nodes can't be visualized in Protege, default to Skolemizing them
    parser.add_argument(
        '-b', '--use_bnodes',
        help="use blank nodes instead of skolemizing", action="store_true",
        default=False)
    parser.add_argument(  # TODO help needs revisiting, push constraints off to the src
        '-t', '--taxon', type=str, help='''
            Constrain Source to supplied taxon identifier(s).
            Please enter comma delimited NCBITaxon numbers:
            Implemented taxa per source
            NCBIGene: 9606,10090,7955
            Panther: 9606,10090,10116,7227,7955,6239,8355
            BioGrid: 9606,10090,10116,7227,7955,6239,8355
            UCSCBands: 9606
            GO: 9606,10090,10116,7227,7955,6239,9615,9823,9031,9913,4896,5782,5052
    ''')
    parser.add_argument(
        '-o', '--test_only',
        help='only process and output the pre-configured test subset',
        action="store_true")

    parser.add_argument(
        '--dest_fmt',
        help='serialization format: [turtle], nt, nquads, rdfxml, n3, raw', type=str)

    parser.add_argument(
        '-v', '--version', help='version of source (deprecated)', type=str)

    parser.add_argument(
        '-d', '--data_release_version',
        help='''
            string indicating the version of data release, e.g. '\'201908\' (YYYYMM),
            used to construct metadata, including version and distribution IRIs
            and downloadURLs
            [defaults to date at start of runtime in ISO 8601 format]
        ''',
        type=str)
    return parser


def _split_csv(text):
    """Split a comma separated string into stripped, non-empty items."""
    return [item.strip() for item in (text or '').split(',') if item.strip()]


def _canonical_dest_fmt(dest_fmt):
    """Map a serializer name or alias to its canonical name.

    Returns 'turtle' when ``dest_fmt`` is None (option not given) and None
    when the name is not a supported serializer.
    """
    if dest_fmt is None:
        return 'turtle'
    canonical = {
        'turtle': 'turtle', 'ttl': 'turtle',
        'ntriples': 'nt', 'nt': 'nt',
        'nquads': 'nquads', 'nq': 'nquads',
        'rdfxml': 'rdfxml', 'xml': 'rdfxml',
        'notation3': 'n3', 'n3': 'n3',
        'raw': 'raw',
    }
    return canonical.get(dest_fmt)


def main():
    """Parse the command line and run the ETL steps for each requested source.

    Every name given with ``--sources`` is validated up front, then its class
    is imported from ``dipper.sources``, instantiated and taken through the
    fetch, parse, write and per-source test steps that the options enable.

    Raises:
        SystemExit: with status 1 for an unsupported ``--dest_fmt`` or an
            unknown source name; with status 0 after listing the known
            sources when ``--sources`` is left at its default of '?'.
    """
    # TODO this should be generated by looking in the dipper/sources directory
    source_to_class_map = {
        # 'facebase_alpha': 'FaceBase_alpha',
        'hpoa': 'HPOAnnotations',
        'zfin': 'ZFIN',
        'omim': 'OMIM',
        'biogrid': 'BioGrid',
        'mgi': 'MGI',
        'impc': 'IMPC',
        'panther': 'Panther',
        'ncbigene': 'NCBIGene',
        'ucscbands': 'UCSCBands',
        'ctd': 'CTD',
        'genereviews': 'GeneReviews',
        'eom': 'EOM',
        'coriell': 'Coriell',
        # 'clinvar': 'ClinVar',      # needs integrating here
        'monochrom': 'Monochrom',
        'kegg': 'KEGG',
        'animalqtldb': 'AnimalQTLdb',
        'ensembl': 'Ensembl',
        'hgnc': 'HGNC',
        'orphanet': 'Orphanet',
        'omia': 'OMIA',
        'flybase': 'FlyBase',
        'mmrrc': 'MMRRC',
        'wormbase': 'WormBase',
        'mpd': 'MPD',
        'gwascatalog': 'GWASCatalog',
        'monarch': 'Monarch',
        'go': 'GeneOntology',
        'reactome': 'Reactome',
        'udp': 'UDP',
        'mgislim': 'MGISlim',
        'zfinslim': 'ZFINSlim',
        'bgee': 'Bgee',
        'mydrug': 'MyDrug',
        'stringdb': 'StringDB',
        'rgd': 'RGD',
        'sgd': 'SGD',
        'mychem': 'MyChem',
        'ebi': 'EBIGene2Phen',
        'xenbase': 'Xenbase'
    }

    # Source classes that are handed the ``tax_ids`` constraint on construction.
    species_specific = (
        'Panther', 'NCBIGene', 'BioGrid', 'UCSCBands',
        'GeneOntology', 'Bgee', 'StringDB', 'Ensembl')

    args = _build_arg_parser().parse_args()

    # Set the log level first so every later message honours --quiet/--debug;
    # --quiet wins when both are given.
    if args.quiet:
        log_level = logging.WARNING
    elif args.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.getLogger().setLevel(log_level)

    tax_ids = None
    if args.taxon is not None:
        tax_ids = []
        for taxon in _split_csv(args.taxon):
            if taxon.isdigit():
                tax_ids.append(taxon)
            else:
                # Previously dropped without notice; keep dropping but say so.
                LOG.warning("Ignoring non-numeric taxon identifier: %s", taxon)

    if not args.use_bnodes:
        LOG.info("Will Skolemize Blank Nodes")

    # NOTE: the former --query option is disabled; none of the query test
    # utils it relied on exist in ./dipper/utils/TestUtils.py

    # Validate all arguments before doing any work so bad input fails fast
    # (and with a non-zero status) instead of after the general test suite ran.
    dest_fmt = _canonical_dest_fmt(args.dest_fmt)
    if dest_fmt is None:
        LOG.error("You have specified an invalid serializer: %s", args.dest_fmt)
        raise SystemExit(1)
    # Stored back on args because vars(args) is handed to each source below.
    args.dest_fmt = dest_fmt

    # Check every requested source, not just the first, using the same
    # normalisation (strip + lowercase) the processing loop relies on.
    sources = [name.lower() for name in _split_csv(args.sources)]
    unknown = [name for name in sources if name not in source_to_class_map]
    if unknown or not sources:
        # '?' is the --sources default and doubles as "list the known
        # sources"; that is informational, anything else is a usage error.
        listing_requested = sources == ['?']
        report = LOG.info if listing_requested else LOG.error
        report('Unknown Source %s', ', '.join(unknown) or args.sources)
        LOG.info('Sources Known are limited to:')
        for key in sorted(source_to_class_map):
            LOG.info('\t%s\t%s', key, source_to_class_map[key])
        raise SystemExit(0 if listing_requested else 1)

    run_tests = not (args.no_verify or args.skip_tests)

    # run initial tests
    if run_tests:
        unittest.TextTestRunner(verbosity=2).run(TEST_SUITE)

    # iterate through all the sources
    for source in sources:
        LOG.info("\n******* %s *******", source)
        src = source_to_class_map[source]

        # import source lib
        imported_module = importlib.import_module("dipper.sources.{0}".format(src))
        source_class = getattr(imported_module, src)

        LOG.info(
            'Command line arguments available to dipper-etl:\n%s',
            "\n".join(['\t{}: {}'.format(k, v) for k, v in vars(args).items()]))

        # Optional keywords are only passed when set, leaving the source
        # class defaults in effect otherwise.
        source_args = {
            'graph_type': args.graph,
            'are_bnodes_skolemized': not args.use_bnodes,
        }
        if src in species_specific:
            source_args['tax_ids'] = tax_ids
        if args.version:
            source_args['version'] = args.version
        if args.data_release_version:
            source_args['data_release_version'] = args.data_release_version

        mysource = source_class(**source_args)

        # WIP cli args should be available to source
        if hasattr(mysource, 'ARGV'):
            mysource.ARGV = vars(args)
        else:
            LOG.error('no where to to put args in %s', mysource.__class__)

        if not args.parse_only:
            start_fetch = time.perf_counter()
            mysource.fetch(args.force)
            LOG.info("Fetching time: %d sec", time.perf_counter() - start_fetch)

        mysource.settestonly(args.test_only)

        # create source ingest graph first (with pristine arguments)
        if not args.test_only and not args.fetch_only:
            start_parse = time.perf_counter()
            mysource.parse(args.limit)
            LOG.info("Parsing time: %d sec", time.perf_counter() - start_parse)

            if args.graph == 'rdf_graph':
                LOG.info("Found %d nodes", len(mysource.graph))

                # Add property axioms
                start_axiom_exp = time.perf_counter()
                LOG.info("Adding property axioms")

                properties = GraphUtils.get_properties_from_graph(mysource.graph)
                GraphUtils.add_property_axioms(mysource.graph, properties)
                LOG.info(
                    "Property axioms added: %d sec",
                    time.perf_counter() - start_axiom_exp)

                start_write = time.perf_counter()
                mysource.write(fmt=args.dest_fmt)
                LOG.info("Writing time: %d sec", time.perf_counter() - start_write)
            # elif args.graph == 'streamed_graph': ...

        # '*_test.ttl' graphs if requested
        if run_tests:
            suite = mysource.getTestSuite()
            if suite is None:
                LOG.warning("No tests configured for this source: %s", source)
            else:
                unittest.TextTestRunner(verbosity=2).run(suite)
        else:
            LOG.info("Skipping Tests for source: %s", source)

        LOG.info('***** Finished with %s *****', source)

    LOG.info("All done.")

# Run the pipeline only when this file is executed as a script, so that
# importing the module does not start an ETL run.
if __name__ == "__main__":
    main()

###########################