From d0136f0cb01016660a993c2da96d85039a80190c Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Fri, 5 Jan 2018 14:23:28 -0700 Subject: [PATCH 001/101] Starting work on scrmpat, which will tabulate site pattern frequencies from scrm output. --- src/scrmpat.c | 646 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 646 insertions(+) create mode 100644 src/scrmpat.c diff --git a/src/scrmpat.c b/src/scrmpat.c new file mode 100644 index 00000000..03226b65 --- /dev/null +++ b/src/scrmpat.c @@ -0,0 +1,646 @@ +/** +@file scrmpat.c +@page scrmpat +@brief Tabulate site pattern frequencies from scrm output. + +# Scrmpat: tabulates site patterns + +Scrmpat reads data generated by scrm (with option -transpose-segsites) and +tabulates counts of nucleotide site patterns, writing the result to +standard output. Optionally, it also calculates a moving-blocks +bootstrap, writing each bootstrap replicate into a separate file. + +# Usage + + Usage: scrmpat [options] <x> <y> ... where <x> and <y> are + arbitrary labels, which refer to the populations in the input + data. The number and order of these labels must agree with + those specified on the scrm command line (using scrm arguments + -I and/or -eI). Labels may not include the character + ":". Maximum number of input files: 32. Writes to standard + output. + + Bootstrap output is available only if input comes from a file + rather than from standard input. + + Options may include: + --infile <name> + Input file name. Def: standard input + --bootfile <name> + Bootstrap output file basename. Def: scrmpat.boot. + -r <x> or --bootreps <x> + # of bootstrap replicates. Def: 0 + -b <x> or --blocksize <x> + # of SNPs per block in moving-blocks bootstrap. Def: 0. + -F or --logFixed + log fixed sites to scrmpat.log + -a or --logAll + log all sites to scrmpat.log + -h or --help + Print this message + +# Example + +`scrmpat` parses a file generated using `scrm`. The `scrm` command +should include the option `-transpose-segsites`. 
Let us assume you +have done this, that file `foo.scrm` contains the output simulated +by `scrm`, and that these simulated data included genotypes referring +to four populations, labelled "x", "y", "n", and "d". The `scrmpat` +command would look like this: +~/daf contains a separate daf file for each population. We want to +compare 4 populations, whose .daf files are `yri.daf`, `ceu.daf`, +`altai.daf`, and `denisova.daf`. The following command will do this, +putting the results into `obs.txt`. + + scrmpat --infile foo.scrm x y n d + +In the output, site pattern "x:y" refers to +the pattern in which the derived allele is present in haploid samples +from "x" and "y" but not in those from other populations. The order of +the command-line arguments determines the order in which labels are +sorted on output. Given the command line above, we would get a site +pattern labeled "x:y:d" rather than, say, "y:x:d". + +The output looks like this: + + # Number of site patterns: 10 + # Tabulated 12327755 SNPs + # SitePat E[count] + x:y 340952.4592501 + x:n 46874.1307236 + x:d 46034.4670204 + y:n 55137.4236715 + y:d 43535.5248078 + n:d 231953.3372578 + x:y:n 91646.1277991 + x:y:d 88476.9619569 + x:n:d 96676.3877423 + y:n:d 100311.4411513 + +The left column lists the site patterns that occur in the data. The +right column gives the expected count of each site pattern. These are +not integers, because they represent averages over all possible +subsamples consisting of a single haploid genome from each +population. + +To generate a bootstrap, use the `--bootreps` option: + + scrmpat --bootreps 50 --infile foo.scrm x y n d > obs.txt + +This will generate not only the primary output file, `obs.txt`, but also +50 additional files, each representing a single bootstrap +replicate. The primary output file now has a bootstrap confidence +interval: + + # Including singleton site patterns. 
+ # Number of site patterns: 10 + # Tabulated 12327755 SNPs + # bootstrap output file = scrmpat.boot + # confidence level = 95% + # SitePat E[count] loBnd hiBnd + x:y 340952.4592501 338825.6604586 342406.6670816 + x:n 46874.1307236 46361.5798377 47438.1857029 + x:d 46034.4670204 45605.6588012 46631.6434277 + y:n 55137.4236715 54650.0763578 55783.7051253 + y:d 43535.5248078 43110.5119922 44234.0919024 + n:d 231953.3372578 229495.3741057 234173.6878092 + x:y:n 91646.1277991 90494.0219749 92873.4443706 + x:y:d 88476.9619569 87137.1867967 89585.8431419 + x:n:d 96676.3877423 95935.5184294 97417.6241185 + y:n:d 100311.4411513 99292.9839140 101163.3457462 + +Here, `loBnd` and `hiBnd` are the limits of a 95% confidence +interval. The bootstrap output files look like `scrmpat.boot000`, +`scrmpat.boot001`, and so on. + +@copyright Copyright (c) 2018, Alan R. Rogers +. This file is released under the Internet +Systems Consortium License, which can be found in file "LICENSE". +*/ + +#include "binary.h" +#include "boot.h" +#include "misc.h" +#include "strint.h" +#include "typedefs.h" +#include "version.h" +#include "error.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAXCHR 24 // maximum number of chromosomes + +typedef struct Stack Stack; + +/// Treat a vector of tipId_t values as a push-down stack. +struct Stack { + int dim, nused; + tipId_t *buff; // not locally owned +}; + +static void usage(void); +static Stack *Stack_new(int dim, tipId_t buff[dim]); +static void Stack_free(Stack * stk); +static void Stack_push(Stack * self, tipId_t x); +static void generatePatterns(int bit, int npops, Stack * stk, tipId_t pat, + int doSing); + +const char *useMsg = + "\nUsage: scrmpat [options] ...\n" + " where and are arbitrary labels, whose number and order must\n" + " agree with that of the populations specified in the scrm command.\n" + " Writes to standard output. 
Labels may not include\n" + " the character \":\"."; + +/// Print usage message and die. +static void usage(void) { + fputs(useMsg, stderr); + fprintf(stderr, " Maximum number of input files: %lu.\n", + 8 * sizeof(tipId_t)); + fputs("\nOptions may include:\n", stderr); + tellopt("--infile ", + "Input file name. Def: standard input"); + tellopt("--bootfile ", + "Bootstrap output file basename. Def: scrmpat.boot."); + tellopt("-r or --bootreps ", "# of bootstrap replicates. Def: 0"); + tellopt("-b or --blocksize ", + "# of SNPs per block in moving-blocks bootstrap. Def: 0."); + tellopt("-F or --logFixed", "log fixed sites to scrmpat.log"); + tellopt("-a or --logAll", "log all sites to scrmpat.log"); + tellopt("--version", "Print version and exit"); + tellopt("-h or --help", "Print this message"); + exit(1); +} + +/// This stack is local to this file. It provides a bounds-controlled +/// interface to an external array, which is passed as an argument, buff, +/// to Stack_new. +static Stack *Stack_new(int dim, tipId_t buff[dim]) { + Stack *self = malloc(sizeof(Stack)); + CHECKMEM(self); + self->dim = dim; + self->buff = buff; + self->nused = 0; + return self; +} + +/// Frees the stack but not the underlying buffer. +static void Stack_free(Stack * stk) { + free(stk); +} + +/// Add an entry to the stack, checking bounds. +static void Stack_push(Stack * self, tipId_t x) { + if(self->nused == self->dim) { + fprintf(stderr, "%s:%s:%d ERR: buffer overflow\n", + __FILE__, __func__, __LINE__); + exit(EXIT_FAILURE); + } + self->buff[self->nused++] = x; +} + +/// Call as generatePatterns(0, npops, stk, 0); Recursive function, +/// which generates all legal site patterns and pushes them onto a +/// stack. +static void +generatePatterns(int bit, int npops, Stack * stk, tipId_t pat, int doSing) { + assert(sizeof(tipId_t) < sizeof(unsigned long long)); + if(bit == npops) { + // Recursion stops here. If current pattern is + // legal, then push it onto the stack. Then return. 
+ + // Exclude patterns with all bits on, or all bits off. + if(pat == 0 || pat == (1ULL << npops) - 1ULL) + return; + // Exclude singleton patterns unless "doSing" is true. + if(!doSing && isPow2(pat)) + return; + Stack_push(stk, pat); + return; + } + tipId_t on = 1UL << bit; + generatePatterns(bit + 1, npops, stk, pat | on, doSing); // curr bit on + generatePatterns(bit + 1, npops, stk, pat, doSing); // curr bit off +} + +int main(int argc, char **argv) { + int i, j, status, optndx, done; + int doSing = 1; // nonzero means use singleton site patterns + long bootreps = 0; + double conf = 0.95; // confidence level + long blocksize = 500; + StrInt *strint = StrInt_new(); + char bootfname[FILENAMESIZE] = { '\0' }; + char errbuff[100] = { '\0' }; + const char *logfname = "scrmpat.log"; + int logFixed = 0, logAll = 0; + FILE *logfile = NULL; + FILE *ifp = stdin; + + static struct option myopts[] = { + // {char *name, int has_arg, int *flag, int val} + {"infile", required_argument, 0, 'i'}, + {"bootfile", required_argument, 0, 'f'}, + {"bootreps", required_argument, 0, 'r'}, + {"blocksize", required_argument, 0, 'b'}, + {"logFixed", no_argument, 0, 'F'}, + {"logAll", no_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + {NULL, 0, NULL, 0} + }; + + // command line arguments + for(;;) { + i = getopt_long(argc, argv, "ab:c:f:hi:r:t:Fv", myopts, &optndx); + if(i == -1) + break; + switch (i) { + case ':': + case '?': + usage(); + break; + case 'b': + blocksize = strtod(optarg, NULL); + if(blocksize <= 0) { + fprintf(stderr, + "%s:%d: bad argument to -b or --blocksize: \"%s\"\n", + __FILE__, __LINE__, optarg); + usage(); + } + break; + case 'f': + status = snprintf(bootfname, sizeof bootfname, "%s", optarg); + if(status >= sizeof bootfname) { + fprintf(stderr, "%s:%d: ERR: Filename %s is too large." 
+ " Max: %zu\n", + __FILE__, __LINE__, optarg, sizeof(bootfname) - 1); + exit(EXIT_FAILURE); + } + break; + case 'h': + usage(); + break; + case 'i': + ifp = fopen(optarg, "r"); + if(ifp==NULL) { + fprintf(stderr,"%s:%d: can't open %s for input.\n", + __FILE__,__LINE__, optarg); + exit(EXIT_FAILURE); + } + break; + case 'V': + printf("scrmpat version %s\n", VERSION); + return 0; + case 'r': + bootreps = strtol(optarg, NULL, 10); + break; + case 'F': + logFixed = 1; + break; + case 'a': + logAll = 1; + break; + default: + usage(); + } + } + + // remaining options: population labels + int n = argc - optind; // number of input files + if(n == 0) + usage(); + + char *poplbl[n]; + LblNdx lndx; + LblNdx_init(&lndx); + + // Number of inputs can't exceed number of bits in an object of + // type tipId_t. + if(n > 8 * sizeof(tipId_t)) { + fprintf(stderr, "Error: %d populations. Max is %lu.\n", + n, 8 * sizeof(tipId_t)); + usage(); + } + // Parse remaining arguments, each of which should be an arbitrary + // label. + for(i = 0; i < n; ++i) { + fname[i] = poplbl[i] = argv[i + optind]; + if(poplbl[i] == NULL + || strlen(poplbl[i]) == 0 + || strchr(poplbl[i], ':') != NULL) + usage(); + LblNdx_addSamples(&lndx, 1, poplbl[i]); + } + + if(logFixed || logAll) { + logfile = fopen(logfname, "w"); + if(logfile == NULL) { + fprintf(stderr, "Can't write to file \"%s\".\n", logfname); + exit(EXIT_FAILURE); + } + } + if(ifp==stdin && (bootreps>0 || bootfname[0] != '\0')) { + fprintf(stderr, "%s:%s: Can't do bootstrap when input is stdin.\n", + __FILE__,__LINE__); + exit(EXIT_FAILURE); + } + // Default boot file name + if(bootfname[0] == '\0') { + const char *defName = "scrmpat.boot"; + status = snprintf(bootfname, sizeof bootfname, "%s", defName); + if(status >= sizeof bootfname) { + fprintf(stderr, "%s:%d: ERR: Filename %s is too large." 
+ " Max: %zu\n", + __FILE__, __LINE__, defName, sizeof(bootfname) - 1); + exit(EXIT_FAILURE); + } + } + + printf("# scrmpat version %s\n", VERSION); + printf("# Population labels:\n"); + for(i = 0; i < n; ++i) + printf("# %4s=%s\n", poplbl[i], fname[i]); + + // make sure labels are all different + for(i = 1; i < n; ++i) + for(j = 0; j < i; ++j) + if(0 == strcmp(poplbl[i], poplbl[j])) { + fprintf(stderr, "ERR: duplicate labels on command line.\n"); + fprintf(stderr, " duplicated label: %s\n", poplbl[i]); + exit(EXIT_FAILURE); + } + + unsigned long npat = (1UL << n) - 2UL; // number of site patterns + if(!doSing) + npat -= n; + printf("# %s singleton site patterns.\n", + (doSing ? "Including" : "Excluding")); + printf("# Number of site patterns: %lu\n", npat); + tipId_t pat[npat]; + double patCount[npat]; + int lblsize = 100; + char lblbuff[lblsize]; + memset(patCount, 0, sizeof(patCount)); + + { + // Stack is a interface to array "pat". + Stack *stk = Stack_new(npat, pat); + + // Put site patterns into array "pat". + generatePatterns(0, n, stk, 0, doSing); + + Stack_free(stk); + } + + // Sort site patterns. Major sort is by number of "on" bits, + // so that doubleton patterns come first, then tripletons, ets. + // Secondary sort is by order in which labels are listed + // on the command line. + qsort(pat, (size_t) npat, sizeof(pat[0]), compare_tipId); + fflush(stdout); + + // Used by bootstrap + Boot *boot = NULL; + int nchr = 0; + char prev[DAFSTRSIZE], chr[DAFSTRSIZE] = { '\0' }; + long nsnp[MAXCHR]; + memset(nsnp, 0, sizeof nsnp); + + // Read the data to get dimensions: number of chromosomes and + // number of snps per chromosome. Then use these dimensions to + // allocate a bootstrap object. 
+ if(bootreps > 0) { + fprintf(stderr, "Doing 1st pass through data to get dimensions...\n"); + + // First pass through data sets values of + // nchr + // nsnp[i] {i=0..nchr-1} + done = 0; + while(!done) { + status = DAFReader_multiNext(n, r); + switch(status) { + case 0: + break; + case EOF: + done=1; + continue; + case ALLELE_MISMATCH: + case NO_ANCESTRAL_ALLELE: + continue; + default: + // something wrong + mystrerror_r(status, errbuff, sizeof errbuff); + fprintf(stderr,"%s:%d: input error (%s)\n", + __FILE__,__LINE__, errbuff); + exit(EXIT_FAILURE); + } + + assert(strlen(DAFReader_chr(r[0])) < sizeof prev); + strcpy(prev, chr); + strcpy(chr, DAFReader_chr(r[0])); + int diff = strcmp(prev, chr); + if(diff != 0) { + StrInt_insert(strint, chr, nchr); + nsnp[nchr] = 1; + ++nchr; + } else + ++nsnp[nchr - 1]; + } + + for(i = 0; i < n; ++i) { + status = DAFReader_rewind(r[i]); + if(status) { + fprintf(stderr, "%s:%d: ERR: can't rewind input stream.\n", + __FILE__, __LINE__); + fprintf(stderr, " If --bootreps > 0, inputs must be" + " files, not pipes.\n"); + exit(EXIT_FAILURE); + } + } + + // Allocate Boot structure + gsl_rng *rng = gsl_rng_alloc(gsl_rng_taus); + gsl_rng_set(rng, (unsigned long) time(NULL)); + boot = Boot_new(nchr, nsnp, bootreps, npat, blocksize, rng); + gsl_rng_free(rng); + CHECKMEM(boot); + } + + unsigned long nsites = 0, nbadaa = 0, nfixed = 0; + long snpndx = -1; + + // Iterate through daf files + fprintf(stderr, "Doing %s pass through data to tabulate patterns..\n", + bootreps > 0 ? 
"2nd" : "single"); + int chrndx = -1, currChr = INT_MAX; + DAFReader_clearChromosomes(n, r); + done=0; + while(!done) { + status = DAFReader_multiNext(n, r); + switch(status) { + case 0: + ++nsites; + break; + case EOF: + done=1; + continue; + case ALLELE_MISMATCH: + case NO_ANCESTRAL_ALLELE: + ++nbadaa; + ++nsites; + continue; + default: + // something wrong + mystrerror_r(status, errbuff, sizeof errbuff); + fprintf(stderr,"%s:%d: input error (%s)\n", + __FILE__,__LINE__, errbuff); + exit(EXIT_FAILURE); + } + + if(bootreps > 0) { + // chrndx is index of current chromosome + errno = 0; + chrndx = StrInt_get(strint, DAFReader_chr(r[0])); + if(errno) { + fprintf(stderr, + "%s:%d: ERR: missing index for chromosome: %s\n", + __FILE__, __LINE__, DAFReader_chr(r[0])); + exit(EXIT_FAILURE); + } + if(chrndx != currChr) { + currChr = chrndx; + snpndx = 0; + } else + ++snpndx; + +#ifndef NDEBUG + assert(snpndx < nsnp[chrndx]); +#endif + } + // p and q are frequencies of derived and ancestral alleles + double p[n], q[n]; + for(j = 0; j < n; ++j) { + p[j] = DAFReader_daf(r[j]); // derived allele freq + q[j] = 1.0 - p[j]; + } + + if(logAll) { + fprintf(logfile, "%5s %10lu\n", DAFReader_chr(r[0]), + DAFReader_nucpos(r[0])); + } + // Contribution of current snp to each site pattern. Inner + // loop considers each bit in current pattern. If that bit is + // on, multiply z by the derived allele frequency, p. If + // that bit is off, multiply by q=1-p. In the end, z is Prod + // p[j]^bit[j] * q[j]^(1-bit[j]) where bit[j] is the value (0 + // or 1) of the j'th bit. 
+ for(i = 0; i < npat; ++i) { + tipId_t pattern = pat[i]; + double z = 1.0; + for(j = 0; j < n; ++j) { + if(pattern & 1u) + z *= p[j]; + else + z *= q[j]; + pattern >>= 1u; + } + if(!isfinite(z)) { + fprintf(stderr, "%s:%d nonfinite z=%lf\n", + __FILE__, __LINE__, z); + fprintf(stderr, " pattern=%d\n", pat[i]); + for(j = 0; j < n; ++j) + fprintf(stderr, " %d: p=%lf q=%lf\n", j, p[j], q[j]); + } + assert(0 == (pattern & 1)); + patCount[i] += z; + if(bootreps > 0) { + assert(snpndx >= 0); + assert(chrndx >= 0); + Boot_add(boot, chrndx, snpndx, i, z); + } + } +#ifndef NDEBUG + if(bootreps > 0) + Boot_sanityCheck(boot, __FILE__, __LINE__); +#endif + errno = 0; + } + printf("# Sites aligned across all populations: %lu\n", nsites); + if(nbadaa) + printf("# Disagreements about alleles : %lu\n", nbadaa); + if(nfixed) + printf("# Monomorphic sites : %lu\n", nfixed); + printf("# Sites used : %lu\n", + nsites - nbadaa - nfixed); + + // boottab[i][j] is the count of the j'th site pattern + // in the i'th bootstrap replicate. + double bootvals[bootreps]; + double boottab[bootreps][npat]; + memset(boottab, 0, sizeof boottab); + + if(bootreps > 0) { + printf("# %s = %s\n", "bootstrap output file", bootfname); + printf("# %s = %4.2lf%%\n", "confidence level", 100 * conf); +#ifndef NDEBUG + Boot_sanityCheck(boot, __FILE__, __LINE__); +#endif + // put site pattern counts into matrix boottab. 
+ for(i = 0; i < bootreps; ++i) + Boot_aggregate(boot, i, npat, boottab[i]); + + // write an output file for each bootstrap replicate + for(j = 0; j < bootreps; ++j) { + char buff[FILENAMESIZE + 3]; + status = snprintf(buff, sizeof buff, "%s%03d", bootfname, j); + if(status >= sizeof buff) + DIE("buffer overflow in snprintf"); + + FILE *fp = fopen(buff, "w"); + if(fp == NULL) + DIE("bad fopen"); + fprintf(fp, "# %13s %20s", "SitePat", "E[count]\n"); + for(i = 0; i < npat; ++i) { + fprintf(fp, "%15s %20.7lf\n", + patLbl(lblsize, lblbuff, pat[i], &lndx), + boottab[j][i]); + } + fclose(fp); + } + } + // print labels and binary representation of site patterns + printf("# %13s %20s", "SitePat", "E[count]"); + if(bootreps > 0) + printf(" %15s %15s", "loBnd", "hiBnd"); + putchar('\n'); + for(i = 0; i < npat; ++i) { + printf("%15s %20.7lf", + patLbl(lblsize, lblbuff, pat[i], &lndx), patCount[i]); + if(bootreps > 0) { + double lowBnd, highBnd; + for(j = 0; j < bootreps; ++j) + bootvals[j] = boottab[j][i]; + confidenceBounds(&lowBnd, &highBnd, conf, bootreps, bootvals); + printf(" %15.7lf %15.7lf", lowBnd, highBnd); + } + putchar('\n'); + } + + for(i = 0; i < n; ++i) + DAFReader_free(r[i]); + if(bootreps > 0) + Boot_free(boot); + StrInt_free(strint); + if(logfile) + fclose(logfile); + fprintf(stderr, "scrmpat is finished\n"); + return 0; +} From ee3f8a6c2891c488219eaabf57aaa16a56b3906a Mon Sep 17 00:00:00 2001 From: "Alan R. 
Rogers" Date: Fri, 5 Jan 2018 17:46:15 -0700 Subject: [PATCH 002/101] Add scrmreader.* --- src/scrmreader.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++ src/scrmreader.h | 30 ++++++++++ src/try.c | 12 +--- src/typedefs.h | 1 + 4 files changed, 185 insertions(+), 10 deletions(-) create mode 100644 src/scrmreader.c create mode 100644 src/scrmreader.h diff --git a/src/scrmreader.c b/src/scrmreader.c new file mode 100644 index 00000000..c8fcc9dc --- /dev/null +++ b/src/scrmreader.c @@ -0,0 +1,152 @@ +/** +@file scrmreader.c +@page scrmreader +@brief Interface to scrm output files. +*/ + +#include "scrmreader.h" +#include "misc.h" +#include +#include +#include + +struct ScrmReader { + int npops; + int *nsamples; + double *daf; + + // Independent replicates in scrm output appear as separate + // chromosomes, which are labelled by unsigned integers. + unsigned chr; + + // Position values in scrm output are ignored. Instead, ScrmReader + // returns positions as a sequence of unsigned integers. + unsigned long nucpos; + FILE *fp; +}; + +// Allocate and initialize a new ScrmReader from input stream. +ScrmReader *ScrmReader_new(FILE *fp) { + // buffer is large, because scrm command lines can be long + char buff[8192]; + errno = 0; + if(fgets(buff, sizeof buff, fp) == NULL) + return NULL; + + if(!strchr(buff, '\n') && !feof(fp)) { + fprintf(stderr, "%s:%d: buffer overflow. 
buff size: %zu\n", + __FILE__, __LINE__, sizeof buff); + fprintf(stderr,"input: %s\n", buff); + return NULL; + } + + ScrmReader *self = malloc(sizeof ScrmReader); + CHECKMEM(self); + memset(self, 0, sizeof ScrmReader); + self->fp = fp; + + Tokenizer *tkz = Tokenizer_new((sizeof buff)/2); + Tokenizer_split(tkz, buff, " "); + int ntokens = Tokenizer_strip(tkz, " \n"); + + if(strcmp("scrm", Tokenizer_token(tkz, 0)) != 0) { + fprintf(stderr,"%s:%d: input file is not scrm output\n", + __FILE__,__LINE__); + free(self); + exit(EXIT_FAILURE); + } + + int i, j, npops; + long h; + char *token, *end; + + // + for(i=1; i < ntokens; ++i) { + token = Tokenizer_token(tkz, i); + if(strcmp("-I", token) == 0) { + token = Tokenizer_token(tkz, i+1); + if(self->npops == 0) { + // set npops and allocate nsamples + self->npops = strtol(token, NULL, 10); + self->nsamples = malloc(self->npops * sizeof(int)); + CHECKMEM(self->nsamples); + for(j=0; j < self->npops; ++j) { + token = Tokenizer_token(tkz, i+2+j); + self->nsamples[j] = strtol(token, NULL, 10); + } + }else{ + // check for consistency and increment nsamples + npops = strtol(token, NULL, 10); + if(npops != self->npops) { + fprintf(stderr,"%s:%d: inconsistent population count:" + " %d != %d\n", + __FILE__,__LINE__,npops, self->npops); + exit(EXIT_FAILURE); + } + for(j=0; j < self->npops; ++j) { + token = Tokenizer_token(tkz, i+2+j); + self->nsamples[j] += strtol(token, NULL, 10); + } + } + i += self->npops; + }else if(strcmp("-eI", token) == 0) { + if(self->npops == 0) { + // count populations and allocate nsamples + for(j=i+2; jnpops; + } + assert(self->nsamples == NULL); + if(self->npops == 0) { + fprintf(stderr,"%s:%d: npops is zero\n", + __FILE__,__LINE__); + exit(EXIT_FAILURE); + } + self->nsamples = malloc(self->npops * sizeof(int)); + for(j=0; jnpops; ++j) { + token = Tokenizer_token(tkz, i+2+j); + h = strtol(token, &end, 10); + assert(end != token); + self->nsamples[j] = h; + } + }else{ + // increment nsamples + 
assert(self->npops > 0); + assert(self->nsamples == NULL); + for(j=0; jnpops; ++j) { + token = Tokenizer_token(tkz, i+2+j); + h = strtol(token, &end, 10); + assert(end != token); + self->nsamples[j] += h; + } + } + } + } + +} + +// Rewind input and reset chr and nucpos. Doesn't work +// if input is stdin. +int ScrmReader_rewind(ScrmReader *self); + +// Move ScrmReader to next nucleotide site. +int ScrmReader_next(ScrmReader *self); + +// Return current chromosome. +unsigned ScrmReader_chr(ScrmReader *self); + +// Return current nucleotide position. +unsigned long ScrmReader_nucpos(ScrmReader *self); + +// Return number of populations. +int ScrmReader_npops(ScrmReader *self); + +// Return number of samples from population i. +int ScrmReader_nsamples(ScrmReader *self, int i); + +// Return frequency of derived allele in sample from population i. +double ScrmReader_daf(ScrmReader *self, int i); diff --git a/src/scrmreader.h b/src/scrmreader.h new file mode 100644 index 00000000..89616395 --- /dev/null +++ b/src/scrmreader.h @@ -0,0 +1,30 @@ +#ifndef SCRMREADER_H +#define SCRMREADER_H + +#include "typedefs.h" + +// constructor +ScrmReader *ScrmReader_new(FILE *fp); + +// Rewind input and reset chr and nucpos. Doesn't work +// if input is stdin. +int ScrmReader_rewind(ScrmReader *self); + +// Move ScrmReader to next nucleotide site. +int ScrmReader_next(ScrmReader *self); + +// Return current chromosome. +unsigned ScrmReader_chr(ScrmReader *self); + +// Return current nucleotide position. +unsigned long ScrmReader_nucpos(ScrmReader *self); + +// Return number of populations. +int ScrmReader_npops(ScrmReader *self); + +// Return number of samples from population i. +int ScrmReader_nsamples(ScrmReader *self, int i); + +// Return frequency of derived allele in sample from population i. 
+double ScrmReader_daf(ScrmReader *self, int i); +#endif diff --git a/src/try.c b/src/try.c index 680fd69c..ff1a6127 100644 --- a/src/try.c +++ b/src/try.c @@ -1,15 +1,7 @@ #include -#include - +#include int main(void) { - int i = INT_MAX; - - printf("max=%d = %e\n", i, (double) i); - - i += 1; - - printf("max+1=%d\n", i); - + printf("%lf\n", 1e10*DBL_EPSILON); return 0; } diff --git a/src/typedefs.h b/src/typedefs.h index 46dccc0c..a02f6fba 100644 --- a/src/typedefs.h +++ b/src/typedefs.h @@ -23,6 +23,7 @@ typedef struct ParKeyVal ParKeyVal; typedef struct ParStore ParStore; typedef struct PopNode PopNode; typedef struct PopNodeTab PopNodeTab; +typedef struct ScrmReader ScrmReader; typedef struct SimSched SimSched; typedef struct SampNdx SampNdx; typedef struct StrInt StrInt; From 4df7daac9d35ba33432b8d578cf7c84b6e2cad92 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Fri, 5 Jan 2018 19:22:18 -0700 Subject: [PATCH 003/101] . --- src/scrmreader.c | 150 ++++++++++++++++++++++++++--------------------- 1 file changed, 83 insertions(+), 67 deletions(-) diff --git a/src/scrmreader.c b/src/scrmreader.c index c8fcc9dc..1f44f987 100644 --- a/src/scrmreader.c +++ b/src/scrmreader.c @@ -25,72 +25,67 @@ struct ScrmReader { FILE *fp; }; -// Allocate and initialize a new ScrmReader from input stream. -ScrmReader *ScrmReader_new(FILE *fp) { - // buffer is large, because scrm command lines can be long - char buff[8192]; - errno = 0; - if(fgets(buff, sizeof buff, fp) == NULL) - return NULL; - - if(!strchr(buff, '\n') && !feof(fp)) { - fprintf(stderr, "%s:%d: buffer overflow. 
buff size: %zu\n", - __FILE__, __LINE__, sizeof buff); - fprintf(stderr,"input: %s\n", buff); - return NULL; - } - - ScrmReader *self = malloc(sizeof ScrmReader); - CHECKMEM(self); - memset(self, 0, sizeof ScrmReader); - self->fp = fp; - - Tokenizer *tkz = Tokenizer_new((sizeof buff)/2); +int *countSamples(size_t bsize, char buff[bsize], int *npops); + +// On input, buff should contain a string representing the scrm command line, +// and nsamples should point to an int. +// +// The function returns a newly-allocated array of ints, whose dimension +// is *npops, the number of populations specified on the scrm command line. +// The i'th entry in this array is the haploid sample size of population i. +// +// On error, the function returns NULL. +int *countSamples(size_t bsize, char buff[bsize], int *npops) { + Tokenizer *tkz = Tokenizer_new(bsize/2); Tokenizer_split(tkz, buff, " "); int ntokens = Tokenizer_strip(tkz, " \n"); if(strcmp("scrm", Tokenizer_token(tkz, 0)) != 0) { fprintf(stderr,"%s:%d: input file is not scrm output\n", __FILE__,__LINE__); - free(self); - exit(EXIT_FAILURE); + return NULL; } - int i, j, npops; + int i, j, npops=0; long h; char *token, *end; + int *nsamples = NULL; - // + // Read through tokens, looking for -I and -eI. Use these arguments + // to set npops and nsamples. 
for(i=1; i < ntokens; ++i) { token = Tokenizer_token(tkz, i); if(strcmp("-I", token) == 0) { token = Tokenizer_token(tkz, i+1); - if(self->npops == 0) { + if(npops == 0) { // set npops and allocate nsamples - self->npops = strtol(token, NULL, 10); - self->nsamples = malloc(self->npops * sizeof(int)); - CHECKMEM(self->nsamples); - for(j=0; j < self->npops; ++j) { - token = Tokenizer_token(tkz, i+2+j); - self->nsamples[j] = strtol(token, NULL, 10); - } - }else{ - // check for consistency and increment nsamples npops = strtol(token, NULL, 10); - if(npops != self->npops) { - fprintf(stderr,"%s:%d: inconsistent population count:" + nsamples = malloc(npops * sizeof(nsamples[0])); + CHECKMEM(nsamples); + memset(nsamples, 0, npops * sizeof(nsamples[0])); + }else{ + // check for consistency + int npops2 = strtol(token, NULL, 10); + if(npops != npops2) { + fprintf(stderr,"%s:%d: ERR: inconsistent population count:" " %d != %d\n", - __FILE__,__LINE__,npops, self->npops); - exit(EXIT_FAILURE); - } - for(j=0; j < self->npops; ++j) { - token = Tokenizer_token(tkz, i+2+j); - self->nsamples[j] += strtol(token, NULL, 10); + __FILE__,__LINE__,npops, npops2); + if(nsamples==NULL) + fprintf("%s:%d: ERR: nsamples==NULL.\n", + __FILE__,__LINE__); + else + free(nsamples); + return NULL; } } - i += self->npops; + // increment samples + for(j=0; j < npops; ++j) { + token = Tokenizer_token(tkz, i+2+j); + nsamples[j] += strtol(token, NULL, 10); + } + i += npops; }else if(strcmp("-eI", token) == 0) { - if(self->npops == 0) { + if(npops == 0) { // count populations and allocate nsamples for(j=i+2; jnpops; + ++npops; } - assert(self->nsamples == NULL); - if(self->npops == 0) { + assert(nsamples == NULL); + if(npops == 0) { fprintf(stderr,"%s:%d: npops is zero\n", __FILE__,__LINE__); - exit(EXIT_FAILURE); - } - self->nsamples = malloc(self->npops * sizeof(int)); - for(j=0; jnpops; ++j) { - token = Tokenizer_token(tkz, i+2+j); - h = strtol(token, &end, 10); - assert(end != token); - 
self->nsamples[j] = h; - } - }else{ - // increment nsamples - assert(self->npops > 0); - assert(self->nsamples == NULL); - for(j=0; jnpops; ++j) { - token = Tokenizer_token(tkz, i+2+j); - h = strtol(token, &end, 10); - assert(end != token); - self->nsamples[j] += h; + return NULL; } + nsamples = malloc(npops * sizeof(nsamples[0])); + CHECKMEM(nsamples); + memset(nsamples, 0, npops * sizeof(nsamples[0])); + } + // increment nsamples + assert(npops > 0); + assert(*nsamples != NULL); + for(j=0; jnsamples[j] += h; } + i += npops; } } - + return nsamples; +} + +// Allocate and initialize a new ScrmReader from input stream. +ScrmReader *ScrmReader_new(FILE *fp) { + // buffer is large, because scrm command lines can be long + char buff[8192]; + errno = 0; + if(fgets(buff, sizeof buff, fp) == NULL) + return NULL; + + if(!strchr(buff, '\n') && !feof(fp)) { + fprintf(stderr, "%s:%d: buffer overflow. buff size: %zu\n", + __FILE__, __LINE__, sizeof buff); + fprintf(stderr,"input: %s\n", buff); + return NULL; + } + + ScrmReader *self = malloc(sizeof ScrmReader); + CHECKMEM(self); + memset(self, 0, sizeof ScrmReader); + self->fp = fp; + + self->npops = countSamples(sizeof buff, buff, &self->nsamples); + + return npops; } // Rewind input and reset chr and nucpos. Doesn't work From 997c76b8ad822b6ec883281cd47edcef678ffcd8 Mon Sep 17 00:00:00 2001 From: "Alan R. 
Rogers" Date: Sat, 6 Jan 2018 16:02:59 -0700 Subject: [PATCH 004/101] 1st draft of scrmreader.c --- src/misc.c | 53 ++++++++++ src/misc.h | 2 + src/scrmreader.c | 251 +++++++++++++++++++++++++++++++++-------------- src/scrmreader.h | 4 + test/output.scrm | 24 +++++ test/xmisc.c | 38 +++++++ 6 files changed, 301 insertions(+), 71 deletions(-) create mode 100644 test/output.scrm diff --git a/src/misc.c b/src/misc.c index deb2ff34..ab1a9019 100644 --- a/src/misc.c +++ b/src/misc.c @@ -10,6 +10,7 @@ #include "binary.h" #include "lblndx.h" #include "version.h" +#include "error.h" #include #include #include @@ -611,3 +612,55 @@ char *strcenter(const char *text, unsigned width, buff[lpad + txtwid + rpad] = '\0'; return buff; } + +/** + * Remove zeroes from an array of unsigned ints by sliding positive + * entries to the left. Return number of non-zero entries. This + * function doesn't re-allocate the array. It just moves zeroes to the + * end. + */ +int removeZeroes(int dim, unsigned x[dim]) { + int i, j; + i=j=0; + while(i < dim) { + if(x[j] > 0) { + // ++ or 0+ + ++i; + ++j; + }else if(x[i]==0 && x[j]==0) { + // 00 + ++i; + }else if(x[i] > 0 && x[j] == 0) { + // +0 + assert(i > j); + x[j] = x[i]; + x[i] = 0; + ++i; + ++j; + } + } + while(j0) + ++j; + return j; +} + +/// Read a line of input into buff. +/// Return EOF or BUFFER_OVERFLOW on failure; 0 on success. 
+int readline(int dim, char buff[dim], FILE *fp) { + if(fgets(buff, dim, fp) == NULL) + return EOF; + + if(NULL == strchr(buff, '\n')) { + if(feof(fp)) + return 0; + int c = fgetc(fp); + if(c == EOF) + return 0; + else { + ungetc(c, fp); + return BUFFER_OVERFLOW; + } + } + + return 0; +} diff --git a/src/misc.h b/src/misc.h index 7e61d941..74e1728e 100644 --- a/src/misc.h +++ b/src/misc.h @@ -20,6 +20,7 @@ double perturb_ratio_w(double x, double w, gsl_rng * rng); double perturb_ratio(double x, gsl_rng * rng); long double perturb_interval(long double x, long double lo, long double hi, gsl_rng * rng); +int removeZeroes(int dim, unsigned x[dim]); void eprintf(const char *fmt, ...); FILE *efopen(const char *restrict name, const char *restrict mode); void printBranchTab(double tab[3][3], FILE * fp); @@ -45,6 +46,7 @@ char *strltrunc(char *s, int n); void hdr(const char *msg); char *strcenter(const char *text, unsigned width, char *buff, size_t buffsize); +int readline(int dim, char buff[dim], FILE *fp); static inline double survival(double t, double twoN); diff --git a/src/scrmreader.c b/src/scrmreader.c index 1f44f987..575ef9bb 100644 --- a/src/scrmreader.c +++ b/src/scrmreader.c @@ -6,13 +6,15 @@ #include "scrmreader.h" #include "misc.h" +#include "tokenizer.h" +#include "error.h" #include #include #include struct ScrmReader { int npops; - int *nsamples; + unsigned *nsamples; double *daf; // Independent replicates in scrm output appear as separate @@ -22,23 +24,31 @@ struct ScrmReader { // Position values in scrm output are ignored. Instead, ScrmReader // returns positions as a sequence of unsigned integers. 
unsigned long nucpos; + Tokenizer *tkz; FILE *fp; }; -int *countSamples(size_t bsize, char buff[bsize], int *npops); +unsigned *countSamples(Tokenizer *tkz, int *npops); +int readuntil(int n, const char str[n], int dim, char buff[dim], FILE *fp); -// On input, buff should contain a string representing the scrm command line, -// and nsamples should point to an int. +// destructor +void ScrmReader_free(ScrmReader *self) { + assert(self); + free(self->nsamples); + free(self->daf); + Tokenizer_free(self->tkz); + free(self); +} + +// On input, tkz should point to a tokenized string representing the +// scrm command line, and nsamples should point to an int. // // The function returns a newly-allocated array of ints, whose dimension // is *npops, the number of populations specified on the scrm command line. // The i'th entry in this array is the haploid sample size of population i. // // On error, the function returns NULL. -int *countSamples(size_t bsize, char buff[bsize], int *npops) { - Tokenizer *tkz = Tokenizer_new(bsize/2); - Tokenizer_split(tkz, buff, " "); - int ntokens = Tokenizer_strip(tkz, " \n"); +unsigned *countSamples(Tokenizer *tkz, int *npops) { if(strcmp("scrm", Tokenizer_token(tkz, 0)) != 0) { fprintf(stderr,"%s:%d: input file is not scrm output\n", @@ -46,123 +56,222 @@ int *countSamples(size_t bsize, char buff[bsize], int *npops) { return NULL; } - int i, j, npops=0; - long h; + int i, j; + long unsigned h; char *token, *end; - int *nsamples = NULL; + unsigned *nsamples = NULL; + int ntokens = Tokenizer_ntokens(tkz); + + *npops=0; // Read through tokens, looking for -I and -eI. Use these arguments // to set npops and nsamples. 
for(i=1; i < ntokens; ++i) { token = Tokenizer_token(tkz, i); - if(strcmp("-I", token) == 0) { - token = Tokenizer_token(tkz, i+1); - if(npops == 0) { - // set npops and allocate nsamples - npops = strtol(token, NULL, 10); - nsamples = malloc(npops * sizeof(nsamples[0])); - CHECKMEM(nsamples); - memset(nsamples, 0, npops * sizeof(nsamples[0])); - }else{ - // check for consistency - int npops2 = strtol(token, NULL, 10); - if(npops != npops2) { - fprintf(stderr,"%s:%d: ERR: inconsistent population count:" - " %d != %d\n", - __FILE__,__LINE__,npops, npops2); - if(nsamples==NULL) - fprintf("%s:%d: ERR: nsamples==NULL.\n", - __FILE__,__LINE__); - else - free(nsamples); - return NULL; - } - } - // increment samples - for(j=0; j < npops; ++j) { - token = Tokenizer_token(tkz, i+2+j); - nsamples[j] += strtol(token, NULL, 10); - } - i += npops; - }else if(strcmp("-eI", token) == 0) { - if(npops == 0) { + if(strcmp("-I", token) == 0 || strcmp("-eI", token) == 0) { + if(*npops == 0) { // count populations and allocate nsamples + assert(nsamples == NULL); for(j=i+2; j 0); + assert(*npops > 0); assert(*nsamples != NULL); - for(j=0; jnsamples[j] += h; + nsamples[j] += h; } - i += npops; + // advance to last argument of -I or -eI + i += 1 + *npops; } } + // Remove populations with zero samples + *npops = removeZeroes(*npops, nsamples); return nsamples; } // Allocate and initialize a new ScrmReader from input stream. ScrmReader *ScrmReader_new(FILE *fp) { + // buffer is large, because scrm command lines can be long char buff[8192]; - errno = 0; - if(fgets(buff, sizeof buff, fp) == NULL) - return NULL; + int status; - if(!strchr(buff, '\n') && !feof(fp)) { - fprintf(stderr, "%s:%d: buffer overflow. 
buff size: %zu\n", - __FILE__, __LINE__, sizeof buff); - fprintf(stderr,"input: %s\n", buff); + status = readline(sizeof(buff), buff, fp); + switch(status) { + case 0: + break; + case EOF: + fprintf(stderr,"%s:%d: unexpected EOF\n", __FILE__,__LINE__); + return NULL; + case BUFFER_OVERFLOW: + fprintf(stderr,"%s:%d: buffer overflow reading scrm command\n", + __FILE__,__LINE__); + return NULL; + default: + fprintf(stderr,"%s:%d: unknown error\n", __FILE__,__LINE__); return NULL; } - ScrmReader *self = malloc(sizeof ScrmReader); + ScrmReader *self = malloc(sizeof(ScrmReader)); CHECKMEM(self); - memset(self, 0, sizeof ScrmReader); + memset(self, 0, sizeof(ScrmReader)); self->fp = fp; - self->npops = countSamples(sizeof buff, buff, &self->nsamples); + self->tkz = Tokenizer_new(sizeof(buff)/2); + Tokenizer_split(self->tkz, buff, " "); + Tokenizer_strip(self->tkz, " \n"); - return npops; + self->nsamples = countSamples(self->tkz, &self->npops); + + // read to line beginning with "position" + status = readuntil(strlen("position"), "position", sizeof(buff), buff, fp); + if(status) { + free(self->nsamples); + ScrmReader_free(self); + return NULL; + } + + // read 1st line of data + status = ScrmReader_next(self); + if(status) { + free(self->nsamples); + ScrmReader_free(self); + return NULL; + } + self->chr = self->nucpos = 0; + self->daf = malloc(self->npops * sizeof(self->daf[0])); + CHECKMEM(self->daf); + return self; +} + +/// Read lines until we reach one that begins with str. +/// Return 0 on success, EOF on failure. +int readuntil(int n, const char str[n], int dim, char buff[dim], FILE *fp) { + int status; + do{ + status = readline(dim, buff, fp); + if(status) + return status; + }while(0 != strncmp(buff, str, n)); + return 0; } // Rewind input and reset chr and nucpos. Doesn't work // if input is stdin. 
-int ScrmReader_rewind(ScrmReader *self); +int ScrmReader_rewind(ScrmReader *self) { + int status; + char buff[8192]; + assert(self->fp != stdin); + errno = 0; + rewind(self->fp); + if(errno) + return errno; + // read to line beginning with "position" + status = readuntil(strlen("position"), "position", sizeof(buff), buff, + self->fp); + if(status) + return status; + + status = ScrmReader_next(self); + if(status) + return status; + self->chr = self->nucpos = 0; + return 0; +} // Move ScrmReader to next nucleotide site. -int ScrmReader_next(ScrmReader *self); +int ScrmReader_next(ScrmReader *self) { + char buff[8192]; + int status, ntokens; + status = readline(sizeof(buff), buff, self->fp); + if(status) + return status; + if(strlen(buff) == 0) { + // new chromosome + status = readuntil(strlen("position"), "position", sizeof(buff), buff, + self->fp); + if(status) + return status; + + status = readline(sizeof(buff), buff, self->fp); + if(status) + return status; + ++self->chr; + self->nucpos = 0; + }else + ++self->nucpos; + Tokenizer_split(self->tkz, buff, " "); + ntokens = Tokenizer_strip(self->tkz, " \n"); + + // calculate derived allele frequency w/i each pop + double nderived; + int pop, i; + int start=2; // skip 1st two columns + char *token, *end; + for(pop=0; pop < self->npops; ++pop) { + nderived = 0.0; + for(i=start; i < start + self->nsamples[pop]; ++i) { + if(i >= Tokenizer_ntokens(self->tkz)) { + fprintf(stderr,"%s:%d: too few genotypes in scrm output\n", + __FILE__,__LINE__); + return EDOM; + } + token = Tokenizer_token(self->tkz, i); + unsigned gtype = strtoul(token, &end, 10); + if(token==end || (gtype!=0 && gtype!=1)) { + fprintf(stderr,"%s:%d: illegal genotype: %s\n", + __FILE__,__LINE__, token); + return EDOM; + } + nderived += gtype; + } + self->daf[pop] = nderived / self->nsamples[pop]; + start += self->nsamples[pop]; + } + return 0; +} // Return current chromosome. 
-unsigned ScrmReader_chr(ScrmReader *self); +unsigned ScrmReader_chr(ScrmReader *self) { + return self->chr; +} // Return current nucleotide position. -unsigned long ScrmReader_nucpos(ScrmReader *self); +unsigned long ScrmReader_nucpos(ScrmReader *self) { + return self->nucpos; +} // Return number of populations. -int ScrmReader_npops(ScrmReader *self); +int ScrmReader_npops(ScrmReader *self) { + return self->npops; +} // Return number of samples from population i. -int ScrmReader_nsamples(ScrmReader *self, int i); +int ScrmReader_nsamples(ScrmReader *self, int i) { + assert(i < self->npops); + return self->nsamples[i]; +} // Return frequency of derived allele in sample from population i. -double ScrmReader_daf(ScrmReader *self, int i); +double ScrmReader_daf(ScrmReader *self, int i) { + assert(i < self->npops); + return self->daf[i]; +} diff --git a/src/scrmreader.h b/src/scrmreader.h index 89616395..3d73eda3 100644 --- a/src/scrmreader.h +++ b/src/scrmreader.h @@ -2,10 +2,14 @@ #define SCRMREADER_H #include "typedefs.h" +#include // constructor ScrmReader *ScrmReader_new(FILE *fp); +// destructor +void ScrmReader_free(ScrmReader *self); + // Rewind input and reset chr and nucpos. Doesn't work // if input is stdin. 
int ScrmReader_rewind(ScrmReader *self); diff --git a/test/output.scrm b/test/output.scrm new file mode 100644 index 00000000..08097de9 --- /dev/null +++ b/test/output.scrm @@ -0,0 +1,24 @@ +scrm 18 2 -l 100r -t 1.35351 -r 0.966782 1000 -transpose-segsites -SC abs -I 5 6 6 0 0 0 -eI 0.0192475 0 0 2 0 0 -eI 0.00561032 0 0 2 0 0 -eI 0.0117678 0 0 0 2 0 -n 1 2.0687 -n 2 1.12947 -n 3 0.239208 -n 4 0.239208 -n 5 0.239208 -en 0.0391811 1 1.04935 -en 0.268103 1 1 -en 0.238251 3 0.143789 -eg 0 2 0.01 -en 0.00413739 2 0.020687 -en 0.00930914 2 2.0687 -en 0.0174805 2 0.20687 -en 0.0178322 2 1.04935 -eg 0 1 0.005 -en 0.00413739 1 0.20687 -en 0.00930914 1 1.04935 -ej 0.771206 5 1 -ej 0.268103 3 1 -ej 0.0391811 2 1 -ej 0.238251 4 1 -eps 0.0196216 2 3 0.987756 -eps 0.2379 4 5 0.969232 +3698475291 + +// +transposed segsites: 8 +position time 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 +28.4673 0.285182 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 +154.078 0.630603 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 +310.596 2.02541 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 +322.935 1.63232 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 +328.764 0.0838485 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 +605.027 0.137518 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 +693.162 0.312203 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 +968.775 0.320993 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + +// +transposed segsites: 6 +position time 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 +170.254 0.285182 1 0 0 1 1 0 1 1 1 1 1 0 0 0 0 0 0 0 +199.295 0.630603 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 +200.906 2.02541 1 0 0 1 1 0 1 1 1 1 1 0 0 0 0 0 0 0 +278.036 1.63232 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0 +522.811 0.0838485 1 0 0 1 1 0 1 1 1 1 1 0 0 0 0 0 0 0 +927.6 0.137518 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 diff --git a/test/xmisc.c b/test/xmisc.c index 1370c09f..b0d8b2d3 100644 --- a/test/xmisc.c +++ b/test/xmisc.c @@ -8,6 +8,7 @@ */ #include "misc.h" +#include "error.h" #include #include #include @@ -59,6 +60,7 @@ int main(int argc, char **argv) { // This should abort. 
fp = efopen("NotThere", "r"); #endif + fclose(fp); unitTstResult("efopen", "OK"); @@ -141,5 +143,41 @@ int main(int argc, char **argv) { unitTstResult("strltrunc", "OK"); + unsigned y[] = {0, 0, 1, 0, 0, 0, 2, 0, 0, 3, 0, 0}; + dim = (int) (sizeof(y)/sizeof(y[0])); + dim = removeZeroes(dim, y); + assert(dim == 3); + for(i=0; i < dim; ++i) { + if(verbose) + printf("%u ", y[i]); + assert(y[i] == i+1u); + } + if(verbose) + putchar('\n'); + unitTstResult("removeZeroes", "OK"); + + fp = fopen("xmisc.tmp", "w"); + assert(fp); + fputs("123456789", fp); + fclose(fp); + + int status; + fp=fopen("xmisc.tmp", "r"); + status = readline(5, buff, fp); + assert(status == BUFFER_OVERFLOW); + rewind(fp); + + status = readline(9, buff, fp); + assert(status == BUFFER_OVERFLOW); + rewind(fp); + + status = readline(10, buff, fp); + assert(status == 0); + + status = readline(10, buff, fp); + assert(status == EOF); + + unitTstResult("readline", "OK"); + return 0; } From 18bed61ceae81370337af48f164be51482a91051 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sun, 7 Jan 2018 10:30:42 -0700 Subject: [PATCH 005/101] scrmreader.c passes unit test. 
--- src/scrmreader.c | 188 ++++++++++++++++++++++++++++++++++++++++++++--- test/Makefile | 10 ++- test/output.scrm | 14 +--- 3 files changed, 190 insertions(+), 22 deletions(-) diff --git a/src/scrmreader.c b/src/scrmreader.c index 575ef9bb..08918cfc 100644 --- a/src/scrmreader.c +++ b/src/scrmreader.c @@ -29,7 +29,7 @@ struct ScrmReader { }; unsigned *countSamples(Tokenizer *tkz, int *npops); -int readuntil(int n, const char str[n], int dim, char buff[dim], FILE *fp); +int readuntil(int n, const char *str, int dim, char buff[dim], FILE *fp); // destructor void ScrmReader_free(ScrmReader *self) { @@ -91,7 +91,7 @@ unsigned *countSamples(Tokenizer *tkz, int *npops) { } // increment nsamples assert(*npops > 0); - assert(*nsamples != NULL); + assert(nsamples != NULL); for(j=0; j < *npops; ++j) { token = Tokenizer_token(tkz, i+2+j); h = strtoul(token, &end, 10); @@ -112,7 +112,7 @@ ScrmReader *ScrmReader_new(FILE *fp) { // buffer is large, because scrm command lines can be long char buff[8192]; - int status; + int i, status; status = readline(sizeof(buff), buff, fp); switch(status) { @@ -141,6 +141,19 @@ ScrmReader *ScrmReader_new(FILE *fp) { self->nsamples = countSamples(self->tkz, &self->npops); + unsigned tot = 0; + for(i=0; i < self->npops; ++i) { + tot += self->nsamples[i]; + } + unsigned tot2 = strtoul(Tokenizer_token(self->tkz, 1), NULL, 10); + if(tot != tot2) { + fprintf(stderr,"%s:%d: incorrect sample count: %u; should be %u\n", + __FILE__,__LINE__, tot, tot2); + free(self->nsamples); + ScrmReader_free(self); + return NULL; + } + // read to line beginning with "position" status = readuntil(strlen("position"), "position", sizeof(buff), buff, fp); if(status) { @@ -149,22 +162,25 @@ ScrmReader *ScrmReader_new(FILE *fp) { return NULL; } + // allocate daf array + self->daf = malloc(self->npops * sizeof(self->daf[0])); + CHECKMEM(self->daf); + // read 1st line of data status = ScrmReader_next(self); if(status) { free(self->nsamples); + free(self->daf); 
ScrmReader_free(self); return NULL; } self->chr = self->nucpos = 0; - self->daf = malloc(self->npops * sizeof(self->daf[0])); - CHECKMEM(self->daf); return self; } /// Read lines until we reach one that begins with str. /// Return 0 on success, EOF on failure. -int readuntil(int n, const char str[n], int dim, char buff[dim], FILE *fp) { +int readuntil(int n, const char *str, int dim, char buff[dim], FILE *fp) { int status; do{ status = readline(dim, buff, fp); @@ -204,7 +220,7 @@ int ScrmReader_next(ScrmReader *self) { status = readline(sizeof(buff), buff, self->fp); if(status) return status; - if(strlen(buff) == 0) { + if(strlen(stripWhiteSpace(buff)) == 0) { // new chromosome status = readuntil(strlen("position"), "position", sizeof(buff), buff, self->fp); @@ -230,8 +246,8 @@ int ScrmReader_next(ScrmReader *self) { nderived = 0.0; for(i=start; i < start + self->nsamples[pop]; ++i) { if(i >= Tokenizer_ntokens(self->tkz)) { - fprintf(stderr,"%s:%d: too few genotypes in scrm output\n", - __FILE__,__LINE__); + fprintf(stderr,"%s:%s:%d: too few genotypes in scrm output\n", + __FILE__,__func__,__LINE__); return EDOM; } token = Tokenizer_token(self->tkz, i); @@ -275,3 +291,157 @@ double ScrmReader_daf(ScrmReader *self, int i) { assert(i < self->npops); return self->daf[i]; } + +#ifdef TEST + +# ifdef NDEBUG +# error "Unit tests must be compiled without -DNDEBUG flag" +# endif + +const char *cmd = "scrm 18 2 -l 100r -t 1.35351 -r 0.966782 1000" + " -transpose-segsites -SC abs -I 5 6 6 0 0 0 -eI 0.0192475 0 0 2 0 0" + " -eI 0.00561032 0 0 2 0 0 -eI 0.0117678 0 0 0 2 0 -n 1 2.0687" + " -n 2 1.12947 -n 3 0.239208 -n 4 0.239208 -n 5 0.239208" + " -en 0.0391811 1 1.04935 -en 0.268103 1 1 -en 0.238251 3 0.143789" + " -eg 0 2 0.01 -en 0.00413739 2 0.020687 -en 0.00930914 2 2.0687" + " -en 0.0174805 2 0.20687 -en 0.0178322 2 1.04935 -eg 0 1 0.005" + " -en 0.00413739 1 0.20687 -en 0.00930914 1 1.04935 -ej 0.771206 5 1" + " -ej 0.268103 3 1 -ej 0.0391811 2 1 -ej 0.238251 4 1" + " 
-eps 0.0196216 2 3 0.987756 -eps 0.2379 4 5 0.969232"; + +int main(int argc, char **argv) { + int verbose = 0; + if(argc > 1) { + if(argc != 2 || 0 != strcmp(argv[1], "-v")) { + fprintf(stderr, "usage: xscrmreader [-v]\n"); + exit(EXIT_FAILURE); + } + verbose = 1; + } + int i, npops=0; + char buff[1000]; + unsigned *nsamples; + Tokenizer *tkz = Tokenizer_new(sizeof(buff)/2); + CHECKMEM(tkz); + + strcpy(buff, cmd); + Tokenizer_split(tkz, buff, " "); + Tokenizer_strip(tkz, " \n"); + nsamples = countSamples(tkz, &npops); + if(verbose) { + printf("countSamples returned: npops=%d; nsamples =", npops); + for(i=0; i < npops; ++i) + printf(" %u", nsamples[i]); + putchar('\n'); + } + + unitTstResult("countSamples", "OK"); + + FILE *fp = fopen("output.scrm", "r"); + int status; + assert(fp); + status = readuntil(strlen("position"), "position", sizeof(buff), buff, fp); + if(status) { + fprintf(stderr,"%s:%d: readuntil returned %d\n", + __FILE__,__LINE__, status); + exit(EXIT_FAILURE); + } + if(verbose) { + fprintf(stderr,"%s:%d: readuntil succeeded: %s\n", + __FILE__,__LINE__, buff); + } + unitTstResult("readuntil", "OK"); + + rewind(fp); + ScrmReader *r = ScrmReader_new(fp); + assert(0 == ScrmReader_chr(r)); + assert(0 == ScrmReader_nucpos(r)); + int np = ScrmReader_npops(r); + assert(np == 4); + assert(6 == ScrmReader_nsamples(r, 0)); + assert(6 == ScrmReader_nsamples(r, 1)); + assert(4 == ScrmReader_nsamples(r, 2)); + assert(2 == ScrmReader_nsamples(r, 3)); + // 0 0 0 0 0 0 | 0 0 0 0 0 0 | 0 0 0 0 | 1 1 + assert(0.0 == ScrmReader_daf(r, 0)); + assert(0.0 == ScrmReader_daf(r, 1)); + assert(0.0 == ScrmReader_daf(r, 2)); + assert(1.0 == ScrmReader_daf(r, 3)); + if(verbose) { + for(i=0; i Date: Sun, 7 Jan 2018 11:04:29 -0700 Subject: [PATCH 006/101] scrmpat compiles and runs. With the test data set, it says there are 3 nucleotide sites, but the true number if 4. 
--- src/Makefile | 5 ++ src/scrmpat.c | 125 +++++++++++++++++++++++--------------------------- 2 files changed, 63 insertions(+), 67 deletions(-) diff --git a/src/Makefile b/src/Makefile index c7da8467..c6411ee3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -69,6 +69,11 @@ SITEPAT := sitepat.o misc.o binary.o lblndx.o parkeyval.o rafreader.o \ sitepat : $(SITEPAT) $(CC) $(CFLAGS) -o $@ $(SITEPAT) $(lib) +SCRMPAT := scrmpat.o misc.o binary.o lblndx.o parkeyval.o scrmreader.o \ + tokenizer.o boot.o error.o +scrmpat : $(SCRMPAT) + $(CC) $(CFLAGS) -o $@ $(SCRMPAT) $(lib) + DAF := daf.o misc.o daf : $(DAF) $(CC) $(CFLAGS) -o $@ $(DAF) $(lib) diff --git a/src/scrmpat.c b/src/scrmpat.c index 03226b65..9020324c 100644 --- a/src/scrmpat.c +++ b/src/scrmpat.c @@ -121,7 +121,7 @@ Systems Consortium License, which can be found in file "LICENSE". #include "binary.h" #include "boot.h" #include "misc.h" -#include "strint.h" +#include "scrmreader.h" #include "typedefs.h" #include "version.h" #include "error.h" @@ -236,7 +236,6 @@ int main(int argc, char **argv) { long bootreps = 0; double conf = 0.95; // confidence level long blocksize = 500; - StrInt *strint = StrInt_new(); char bootfname[FILENAMESIZE] = { '\0' }; char errbuff[100] = { '\0' }; const char *logfname = "scrmpat.log"; @@ -332,7 +331,7 @@ int main(int argc, char **argv) { // Parse remaining arguments, each of which should be an arbitrary // label. 
for(i = 0; i < n; ++i) { - fname[i] = poplbl[i] = argv[i + optind]; + poplbl[i] = argv[i + optind]; if(poplbl[i] == NULL || strlen(poplbl[i]) == 0 || strchr(poplbl[i], ':') != NULL) @@ -348,7 +347,7 @@ int main(int argc, char **argv) { } } if(ifp==stdin && (bootreps>0 || bootfname[0] != '\0')) { - fprintf(stderr, "%s:%s: Can't do bootstrap when input is stdin.\n", + fprintf(stderr, "%s:%d: Can't do bootstrap when input is stdin.\n", __FILE__,__LINE__); exit(EXIT_FAILURE); } @@ -365,9 +364,10 @@ int main(int argc, char **argv) { } printf("# scrmpat version %s\n", VERSION); - printf("# Population labels:\n"); + printf("# Population labels:"); for(i = 0; i < n; ++i) - printf("# %4s=%s\n", poplbl[i], fname[i]); + printf(" %s", poplbl[i]); + putchar('\n'); // make sure labels are all different for(i = 1; i < n; ++i) @@ -410,10 +410,17 @@ int main(int argc, char **argv) { // Used by bootstrap Boot *boot = NULL; int nchr = 0; - char prev[DAFSTRSIZE], chr[DAFSTRSIZE] = { '\0' }; + unsigned prev, chr=0; long nsnp[MAXCHR]; memset(nsnp, 0, sizeof nsnp); + ScrmReader *r=ScrmReader_new(ifp); + if(r == NULL) { + fprintf(stderr,"%s:%d: Can't read scrm output\n", + __FILE__,__LINE__); + exit(EXIT_FAILURE); + } + // Read the data to get dimensions: number of chromosomes and // number of snps per chromosome. Then use these dimensions to // allocate a bootstrap object. 
@@ -425,7 +432,15 @@ int main(int argc, char **argv) { // nsnp[i] {i=0..nchr-1} done = 0; while(!done) { - status = DAFReader_multiNext(n, r); + prev = chr; + chr = ScrmReader_chr(r); + if(prev != chr) { + nsnp[nchr] = 1; + ++nchr; + } else + ++nsnp[nchr - 1]; + + status = ScrmReader_next(r); switch(status) { case 0: break; @@ -442,28 +457,15 @@ int main(int argc, char **argv) { __FILE__,__LINE__, errbuff); exit(EXIT_FAILURE); } - - assert(strlen(DAFReader_chr(r[0])) < sizeof prev); - strcpy(prev, chr); - strcpy(chr, DAFReader_chr(r[0])); - int diff = strcmp(prev, chr); - if(diff != 0) { - StrInt_insert(strint, chr, nchr); - nsnp[nchr] = 1; - ++nchr; - } else - ++nsnp[nchr - 1]; } - for(i = 0; i < n; ++i) { - status = DAFReader_rewind(r[i]); - if(status) { - fprintf(stderr, "%s:%d: ERR: can't rewind input stream.\n", - __FILE__, __LINE__); - fprintf(stderr, " If --bootreps > 0, inputs must be" - " files, not pipes.\n"); - exit(EXIT_FAILURE); - } + status = ScrmReader_rewind(r); + if(status) { + fprintf(stderr, "%s:%d: ERR: can't rewind input stream.\n", + __FILE__, __LINE__); + fprintf(stderr, " If --bootreps > 0, inputs must be" + " files, not pipes.\n"); + exit(EXIT_FAILURE); } // Allocate Boot structure @@ -477,44 +479,15 @@ int main(int argc, char **argv) { unsigned long nsites = 0, nbadaa = 0, nfixed = 0; long snpndx = -1; - // Iterate through daf files + // Read data fprintf(stderr, "Doing %s pass through data to tabulate patterns..\n", bootreps > 0 ? 
"2nd" : "single"); int chrndx = -1, currChr = INT_MAX; - DAFReader_clearChromosomes(n, r); done=0; while(!done) { - status = DAFReader_multiNext(n, r); - switch(status) { - case 0: - ++nsites; - break; - case EOF: - done=1; - continue; - case ALLELE_MISMATCH: - case NO_ANCESTRAL_ALLELE: - ++nbadaa; - ++nsites; - continue; - default: - // something wrong - mystrerror_r(status, errbuff, sizeof errbuff); - fprintf(stderr,"%s:%d: input error (%s)\n", - __FILE__,__LINE__, errbuff); - exit(EXIT_FAILURE); - } - if(bootreps > 0) { // chrndx is index of current chromosome - errno = 0; - chrndx = StrInt_get(strint, DAFReader_chr(r[0])); - if(errno) { - fprintf(stderr, - "%s:%d: ERR: missing index for chromosome: %s\n", - __FILE__, __LINE__, DAFReader_chr(r[0])); - exit(EXIT_FAILURE); - } + chrndx = ScrmReader_chr(r); if(chrndx != currChr) { currChr = chrndx; snpndx = 0; @@ -528,13 +501,13 @@ int main(int argc, char **argv) { // p and q are frequencies of derived and ancestral alleles double p[n], q[n]; for(j = 0; j < n; ++j) { - p[j] = DAFReader_daf(r[j]); // derived allele freq + p[j] = ScrmReader_daf(r,j); // derived allele freq q[j] = 1.0 - p[j]; } if(logAll) { - fprintf(logfile, "%5s %10lu\n", DAFReader_chr(r[0]), - DAFReader_nucpos(r[0])); + fprintf(logfile, "%5u %10lu\n", ScrmReader_chr(r), + ScrmReader_nucpos(r)); } // Contribution of current snp to each site pattern. Inner // loop considers each bit in current pattern. 
If that bit is @@ -566,14 +539,34 @@ int main(int argc, char **argv) { assert(chrndx >= 0); Boot_add(boot, chrndx, snpndx, i, z); } + } #ifndef NDEBUG if(bootreps > 0) Boot_sanityCheck(boot, __FILE__, __LINE__); #endif - errno = 0; + status = ScrmReader_next(r); + switch(status) { + case 0: + ++nsites; + break; + case EOF: + done=1; + continue; + case ALLELE_MISMATCH: + case NO_ANCESTRAL_ALLELE: + ++nbadaa; + ++nsites; + continue; + default: + // something wrong + mystrerror_r(status, errbuff, sizeof errbuff); + fprintf(stderr,"%s:%d: input error (%s)\n", + __FILE__,__LINE__, errbuff); + exit(EXIT_FAILURE); + } } - printf("# Sites aligned across all populations: %lu\n", nsites); + printf("# Nucleotide sites: %lu\n", nsites); if(nbadaa) printf("# Disagreements about alleles : %lu\n", nbadaa); if(nfixed) @@ -634,11 +627,9 @@ int main(int argc, char **argv) { putchar('\n'); } - for(i = 0; i < n; ++i) - DAFReader_free(r[i]); + ScrmReader_free(r); if(bootreps > 0) Boot_free(boot); - StrInt_free(strint); if(logfile) fclose(logfile); fprintf(stderr, "scrmpat is finished\n"); From 9c96103e8dc10b7ef0494fe9446e52e03e05b09c Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sun, 7 Jan 2018 12:24:34 -0700 Subject: [PATCH 007/101] scrmpat works. --- src/Makefile | 6 ++-- src/scrmpat.c | 65 +++++++++++++++++++------------------- src/scrmreader.c | 82 +++++++++++++++++++++++++----------------------- 3 files changed, 79 insertions(+), 74 deletions(-) diff --git a/src/Makefile b/src/Makefile index c6411ee3..689a7f2f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -7,7 +7,7 @@ opt := -DNDEBUG -O3 -finline-functions # For full optimization prof := incl := -I/usr/local/include -I/opt/local/include -targets := legosim legofit tabpat sitepat daf raf numcores +targets := legosim legofit tabpat sitepat scrmpat daf raf numcores pytargets := diverg.py bootci.py flatfile.py tests := xzeroin xbinary @@ -43,7 +43,7 @@ test : $(tests) -./xzeroin @echo "ALL UNIT TESTS WERE COMPLETED." 
-version.h: +version : ./mkversion.py > version.h LEGOSIM := legosim.o patprob.o gptree.o binary.o jobqueue.o misc.o parse.o \ @@ -112,5 +112,5 @@ include depend .SUFFIXES: .SUFFIXES: .c .o -.PHONY: clean +.PHONY: clean version diff --git a/src/scrmpat.c b/src/scrmpat.c index 9020324c..a72e591b 100644 --- a/src/scrmpat.c +++ b/src/scrmpat.c @@ -258,7 +258,7 @@ int main(int argc, char **argv) { // command line arguments for(;;) { - i = getopt_long(argc, argv, "ab:c:f:hi:r:t:Fv", myopts, &optndx); + i = getopt_long(argc, argv, "ab:c:hi:r:t:Fv", myopts, &optndx); if(i == -1) break; switch (i) { @@ -432,14 +432,6 @@ int main(int argc, char **argv) { // nsnp[i] {i=0..nchr-1} done = 0; while(!done) { - prev = chr; - chr = ScrmReader_chr(r); - if(prev != chr) { - nsnp[nchr] = 1; - ++nchr; - } else - ++nsnp[nchr - 1]; - status = ScrmReader_next(r); switch(status) { case 0: @@ -457,6 +449,14 @@ int main(int argc, char **argv) { __FILE__,__LINE__, errbuff); exit(EXIT_FAILURE); } + + prev = chr; + chr = ScrmReader_chr(r); + if(prev != chr) { + nsnp[nchr] = 1; + ++nchr; + } else + ++nsnp[nchr - 1]; } status = ScrmReader_rewind(r); @@ -485,6 +485,27 @@ int main(int argc, char **argv) { int chrndx = -1, currChr = INT_MAX; done=0; while(!done) { + status = ScrmReader_next(r); + switch(status) { + case 0: + ++nsites; + break; + case EOF: + done=1; + continue; + case ALLELE_MISMATCH: + case NO_ANCESTRAL_ALLELE: + ++nbadaa; + ++nsites; + continue; + default: + // something wrong + mystrerror_r(status, errbuff, sizeof errbuff); + fprintf(stderr,"%s:%d: input error (%s)\n", + __FILE__,__LINE__, errbuff); + exit(EXIT_FAILURE); + } + if(bootreps > 0) { // chrndx is index of current chromosome chrndx = ScrmReader_chr(r); @@ -545,33 +566,13 @@ int main(int argc, char **argv) { if(bootreps > 0) Boot_sanityCheck(boot, __FILE__, __LINE__); #endif - status = ScrmReader_next(r); - switch(status) { - case 0: - ++nsites; - break; - case EOF: - done=1; - continue; - case ALLELE_MISMATCH: - case 
NO_ANCESTRAL_ALLELE: - ++nbadaa; - ++nsites; - continue; - default: - // something wrong - mystrerror_r(status, errbuff, sizeof errbuff); - fprintf(stderr,"%s:%d: input error (%s)\n", - __FILE__,__LINE__, errbuff); - exit(EXIT_FAILURE); - } } printf("# Nucleotide sites: %lu\n", nsites); if(nbadaa) - printf("# Disagreements about alleles : %lu\n", nbadaa); + printf("# Disagreements about alleles: %lu\n", nbadaa); if(nfixed) - printf("# Monomorphic sites : %lu\n", nfixed); - printf("# Sites used : %lu\n", + printf("# Monomorphic sites: %lu\n", nfixed); + printf("# Sites used: %lu\n", nsites - nbadaa - nfixed); // boottab[i][j] is the count of the j'th site pattern diff --git a/src/scrmreader.c b/src/scrmreader.c index 08918cfc..6f4d3cab 100644 --- a/src/scrmreader.c +++ b/src/scrmreader.c @@ -19,16 +19,16 @@ struct ScrmReader { // Independent replicates in scrm output appear as separate // chromosomes, which are labelled by unsigned integers. - unsigned chr; + int chr; // Position values in scrm output are ignored. Instead, ScrmReader // returns positions as a sequence of unsigned integers. - unsigned long nucpos; + long nucpos; Tokenizer *tkz; FILE *fp; }; -unsigned *countSamples(Tokenizer *tkz, int *npops); +unsigned *countSamples(Tokenizer *tkz, int *npops, int *transpose); int readuntil(int n, const char *str, int dim, char buff[dim], FILE *fp); // destructor @@ -48,7 +48,9 @@ void ScrmReader_free(ScrmReader *self) { // The i'th entry in this array is the haploid sample size of population i. // // On error, the function returns NULL. 
-unsigned *countSamples(Tokenizer *tkz, int *npops) { +unsigned *countSamples(Tokenizer *tkz, int *npops, int *transpose) { + + *transpose = 0; if(strcmp("scrm", Tokenizer_token(tkz, 0)) != 0) { fprintf(stderr,"%s:%d: input file is not scrm output\n", @@ -100,7 +102,8 @@ unsigned *countSamples(Tokenizer *tkz, int *npops) { } // advance to last argument of -I or -eI i += 1 + *npops; - } + }else if(0 == strcmp(token, "-transpose-segsites")) + *transpose = 1; } // Remove populations with zero samples *npops = removeZeroes(*npops, nsamples); @@ -139,7 +142,16 @@ ScrmReader *ScrmReader_new(FILE *fp) { Tokenizer_split(self->tkz, buff, " "); Tokenizer_strip(self->tkz, " \n"); - self->nsamples = countSamples(self->tkz, &self->npops); + int transpose; + self->nsamples = countSamples(self->tkz, &self->npops, &transpose); + if(!transpose) { + fprintf(stderr,"%s:%d: -transpose-segsites missing from scrm cmd\n", + __FILE__,__LINE__); + Tokenizer_free(self->tkz); + free(self->nsamples); + free(self); + return NULL; + } unsigned tot = 0; for(i=0; i < self->npops; ++i) { @@ -154,27 +166,11 @@ ScrmReader *ScrmReader_new(FILE *fp) { return NULL; } - // read to line beginning with "position" - status = readuntil(strlen("position"), "position", sizeof(buff), buff, fp); - if(status) { - free(self->nsamples); - ScrmReader_free(self); - return NULL; - } - // allocate daf array self->daf = malloc(self->npops * sizeof(self->daf[0])); CHECKMEM(self->daf); - // read 1st line of data - status = ScrmReader_next(self); - if(status) { - free(self->nsamples); - free(self->daf); - ScrmReader_free(self); - return NULL; - } - self->chr = self->nucpos = 0; + self->chr = self->nucpos = -1; return self; } @@ -193,23 +189,13 @@ int readuntil(int n, const char *str, int dim, char buff[dim], FILE *fp) { // Rewind input and reset chr and nucpos. Doesn't work // if input is stdin. 
int ScrmReader_rewind(ScrmReader *self) { - int status; - char buff[8192]; assert(self->fp != stdin); errno = 0; rewind(self->fp); if(errno) return errno; - // read to line beginning with "position" - status = readuntil(strlen("position"), "position", sizeof(buff), buff, - self->fp); - if(status) - return status; - status = ScrmReader_next(self); - if(status) - return status; - self->chr = self->nucpos = 0; + self->chr = self->nucpos = -1; return 0; } @@ -220,7 +206,7 @@ int ScrmReader_next(ScrmReader *self) { status = readline(sizeof(buff), buff, self->fp); if(status) return status; - if(strlen(stripWhiteSpace(buff)) == 0) { + if(self->chr == -1 || strlen(stripWhiteSpace(buff)) == 0) { // new chromosome status = readuntil(strlen("position"), "position", sizeof(buff), buff, self->fp); @@ -289,6 +275,8 @@ int ScrmReader_nsamples(ScrmReader *self, int i) { // Return frequency of derived allele in sample from population i. double ScrmReader_daf(ScrmReader *self, int i) { assert(i < self->npops); + assert(self->chr >= 0); + assert(self->nucpos >= 0); return self->daf[i]; } @@ -318,7 +306,7 @@ int main(int argc, char **argv) { } verbose = 1; } - int i, npops=0; + int i, npops=0, transpose; char buff[1000]; unsigned *nsamples; Tokenizer *tkz = Tokenizer_new(sizeof(buff)/2); @@ -327,13 +315,18 @@ int main(int argc, char **argv) { strcpy(buff, cmd); Tokenizer_split(tkz, buff, " "); Tokenizer_strip(tkz, " \n"); - nsamples = countSamples(tkz, &npops); + nsamples = countSamples(tkz, &npops, &transpose); if(verbose) { printf("countSamples returned: npops=%d; nsamples =", npops); for(i=0; i < npops; ++i) printf(" %u", nsamples[i]); putchar('\n'); } + if(!transpose) { + fprintf(stderr,"%s:%d: -transpose-segsites missing from scrm cmd\n", + __FILE__,__LINE__); + exit(1); + } unitTstResult("countSamples", "OK"); @@ -354,14 +347,19 @@ int main(int argc, char **argv) { rewind(fp); ScrmReader *r = ScrmReader_new(fp); - assert(0 == ScrmReader_chr(r)); - assert(0 == 
ScrmReader_nucpos(r)); + assert(-1 == ScrmReader_chr(r)); + assert(-1 == ScrmReader_nucpos(r)); int np = ScrmReader_npops(r); assert(np == 4); assert(6 == ScrmReader_nsamples(r, 0)); assert(6 == ScrmReader_nsamples(r, 1)); assert(4 == ScrmReader_nsamples(r, 2)); assert(2 == ScrmReader_nsamples(r, 3)); + + status = ScrmReader_next(r); + assert(status==0); + assert(0 == ScrmReader_chr(r)); + assert(0 == ScrmReader_nucpos(r)); // 0 0 0 0 0 0 | 0 0 0 0 0 0 | 0 0 0 0 | 1 1 assert(0.0 == ScrmReader_daf(r, 0)); assert(0.0 == ScrmReader_daf(r, 1)); @@ -420,6 +418,12 @@ int main(int argc, char **argv) { __FILE__,__LINE__,status); } + status = ScrmReader_next(r); + if(status) { + fprintf(stderr,"%s:%d: ScrmReader_next returned %d\n", + __FILE__,__LINE__,status); + } + assert(0 == ScrmReader_chr(r)); assert(0 == ScrmReader_nucpos(r)); np = ScrmReader_npops(r); From bdc62ecda11ccf0ce0d40374f2d6e64cf656f9e6 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sun, 7 Jan 2018 14:04:25 -0700 Subject: [PATCH 008/101] Modified test/xmisc.c so that it removes its tmp file. Added lines to .gitignore. 
--- .gitignore | 2 ++ test/xmisc.c | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 44fbbfbf..859f8c98 100644 --- a/.gitignore +++ b/.gitignore @@ -29,11 +29,13 @@ xjobqueue xdafreader xrafreader xmisc +xscrmreader xstrint legosim numcores legofit tabpat +scrmpat sitepat tabpat.log daf diff --git a/test/xmisc.c b/test/xmisc.c index b0d8b2d3..8260efef 100644 --- a/test/xmisc.c +++ b/test/xmisc.c @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef NDEBUG # error "Unit tests must be compiled without -DNDEBUG flag" @@ -161,8 +162,9 @@ int main(int argc, char **argv) { fputs("123456789", fp); fclose(fp); + const char *tmpfile = "xmisc.tmp"; int status; - fp=fopen("xmisc.tmp", "r"); + fp=fopen(tmpfile, "r"); status = readline(5, buff, fp); assert(status == BUFFER_OVERFLOW); rewind(fp); @@ -179,5 +181,6 @@ int main(int argc, char **argv) { unitTstResult("readline", "OK"); + unlink(tmpfile); return 0; } From 895dd5104e44df1974c2dc4121c1b197b8b6f4ed Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sun, 7 Jan 2018 14:47:34 -0700 Subject: [PATCH 009/101] Removed unused variable "ntokens" from ScrmReader_next. --- src/scrmreader.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scrmreader.c b/src/scrmreader.c index 6f4d3cab..03f065e0 100644 --- a/src/scrmreader.c +++ b/src/scrmreader.c @@ -202,7 +202,7 @@ int ScrmReader_rewind(ScrmReader *self) { // Move ScrmReader to next nucleotide site. 
int ScrmReader_next(ScrmReader *self) { char buff[8192]; - int status, ntokens; + int status; status = readline(sizeof(buff), buff, self->fp); if(status) return status; @@ -221,7 +221,7 @@ int ScrmReader_next(ScrmReader *self) { }else ++self->nucpos; Tokenizer_split(self->tkz, buff, " "); - ntokens = Tokenizer_strip(self->tkz, " \n"); + Tokenizer_strip(self->tkz, " \n"); // calculate derived allele frequency w/i each pop double nderived; From b07d13f7631c003c97b6dfcb63ea5df206ac823c Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Thu, 11 Jan 2018 14:54:46 -0700 Subject: [PATCH 010/101] Removed mkversion.py and added version.h. --- .gitignore | 1 - src/Makefile | 4 ---- src/mkversion.py | 12 ------------ src/version.h | 3 +++ 4 files changed, 3 insertions(+), 17 deletions(-) delete mode 100755 src/mkversion.py create mode 100644 src/version.h diff --git a/.gitignore b/.gitignore index 859f8c98..70cf2b46 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,6 @@ core depend raf -version.h xboot xdtnorm xexopar diff --git a/src/Makefile b/src/Makefile index 689a7f2f..4cd9b99b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -43,9 +43,6 @@ test : $(tests) -./xzeroin @echo "ALL UNIT TESTS WERE COMPLETED." 
-version : - ./mkversion.py > version.h - LEGOSIM := legosim.o patprob.o gptree.o binary.o jobqueue.o misc.o parse.o \ branchtab.o popnodetab.o lblndx.o tokenizer.o parstore.o parkeyval.o \ popnode.o gene.o dprintf.o rngseed.o dtnorm.o tinyexpr.o @@ -112,5 +109,4 @@ include depend .SUFFIXES: .SUFFIXES: .c .o -.PHONY: clean version diff --git a/src/mkversion.py b/src/mkversion.py deleted file mode 100755 index dbfbc80c..00000000 --- a/src/mkversion.py +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/python -import shlex, subprocess - -cmd = 'git log -n 1 --date=short --format=format:"rev.%ad.%h" HEAD' -args = shlex.split(cmd) -p = subprocess.Popen(args, stdout=subprocess.PIPE) -sha = p.stdout.readline() -p.terminate() - -print "#ifndef VERSION" -print '#define VERSION "%s"' % sha -print "#endif" diff --git a/src/version.h b/src/version.h new file mode 100644 index 00000000..074929b9 --- /dev/null +++ b/src/version.h @@ -0,0 +1,3 @@ +#ifndef VERSION +#define VERSION "1.0" +#endif From 6967135f79e873b71259eec1f2341e7d3831e8d7 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Fri, 12 Jan 2018 09:33:34 -0700 Subject: [PATCH 011/101] Modified the code that scrmpat uses to count samples in the scrm command line. Previously, all samples from a given population were conflated. Now, samples of different ages are kept separate. For example, if the scrm command line included "-I 2 1 1 -eI 0.1 0 1", the old code would have generated an array with 2 entries: [1,2]. The first entry would have been the number of samples in population 0, the second would have been the number of samples in population 1. In the new code, this same input will generate an array with 3 entries: [1,1,1], which correspond to the number of samples in population 0, that in population 1 for time 0, and that in population 1 for time 0.1. The number of labels listed on the scrmpat command line must correspond to the dimension of the array generated by the new code. 
--- src/scrmpat.c | 11 ++- src/scrmreader.c | 202 +++++++++++++++++++++++++++++++++++------------ src/scrmreader.h | 4 +- 3 files changed, 164 insertions(+), 53 deletions(-) diff --git a/src/scrmpat.c b/src/scrmpat.c index a72e591b..19fed5c6 100644 --- a/src/scrmpat.c +++ b/src/scrmpat.c @@ -45,7 +45,7 @@ bootstrap, writing each bootstrap replicate into a separate file. should include the option `-transpost-segsites`. Let us assume you have done this, that file `foo.scrm` contains the output simulated by `scrm`, and that these simulated data included genotypes referring -to four populations, labelled "x", "y", "n", and "d". The `scrmpat` +to four populations, labeled "x", "y", "n", and "d". The `scrmpat` command woule look like this: ~/daf contains a separate daf file for each population. We want to compare 4 populations, whose .daf files are `yri.daf`, `ceu.daf`, @@ -313,7 +313,7 @@ int main(int argc, char **argv) { } // remaining options: population labels - int n = argc - optind; // number of input files + int n = argc - optind; // number of population labels if(n == 0) usage(); @@ -420,6 +420,13 @@ int main(int argc, char **argv) { __FILE__,__LINE__); exit(EXIT_FAILURE); } + if(n != ScrmReader_sampleDim(r)) { + fprintf(stderr,"%s:%d:" + " Number (%d) of labels != dimension (%d) of sample array\n" + "in scrm output.\n", + __FILE__,__LINE__,n,ScrmReader_sampleDim(r)); + exit(EXIT_FAILURE); + } // Read the data to get dimensions: number of chromosomes and // number of snps per chromosome. 
Then use these dimensions to diff --git a/src/scrmreader.c b/src/scrmreader.c index 03f065e0..467a89dd 100644 --- a/src/scrmreader.c +++ b/src/scrmreader.c @@ -12,8 +12,15 @@ #include #include +typedef struct FIFOstack FIFOstack; + +struct FIFOstack { + struct FIFOstack *next; + unsigned value; +}; + struct ScrmReader { - int npops; + int sampleDim; unsigned *nsamples; double *daf; @@ -28,8 +35,54 @@ struct ScrmReader { FILE *fp; }; -unsigned *countSamples(Tokenizer *tkz, int *npops, int *transpose); +unsigned *countSamples(Tokenizer *tkz, int *sampleDim, int *transpose); int readuntil(int n, const char *str, int dim, char buff[dim], FILE *fp); +FIFOstack *FIFOstack_push(FIFOstack *prev, unsigned val); +FIFOstack *FIFOstack_pop(FIFOstack *self, unsigned *value); +int FIFOstack_length(FIFOstack *self); + +// Push a value onto the tail of the stack. Return pointer to new +// head. Example: +// +// FIFOstack *stack=NULL; +// stack = FIFOstack_push(stack, 1u); +// stack = FIFOstack_push(stack, 2u); +FIFOstack *FIFOstack_push(FIFOstack *self, unsigned value) { + if(self != NULL) { + self->next = FIFOstack_push(self->next, value); + return self; + } + FIFOstack *new = malloc(sizeof(FIFOstack)); + CHECKMEM(new); + new->value = value; + new->next = NULL; + return new; +} + +// Pop a value off the head of the stack. Return pointer to new +// head. 
Example: +// +// FIFOstack *stack=NULL; +// stack = FIFOstack_push(stack, 1u); +// stack = FIFOstack_push(stack, 2u); +// +// unsigned x; +// stack = FIFOstack_pop(stack, &x); // x=1 +// stack = FIFOstack_pop(stack, &x); // x=2 +FIFOstack *FIFOstack_pop(FIFOstack *self, unsigned *value) { + if(self==NULL) + return NULL; + *value = self->value; + FIFOstack *next = self->next; + free(self); + return next; +} + +int FIFOstack_length(FIFOstack *self) { + if(self==NULL) + return 0; + return 1 + FIFOstack_length(self->next); +} // destructor void ScrmReader_free(ScrmReader *self) { @@ -44,11 +97,11 @@ void ScrmReader_free(ScrmReader *self) { // scrm command line, and nsamples should point to an int. // // The function returns a newly-allocated array of ints, whose dimension -// is *npops, the number of populations specified on the scrm command line. +// is *sampleDim, the number of populations specified on the scrm command line. // The i'th entry in this array is the haploid sample size of population i. // // On error, the function returns NULL. -unsigned *countSamples(Tokenizer *tkz, int *npops, int *transpose) { +unsigned *countSamples(Tokenizer *tkz, int *sampleDim, int *transpose) { *transpose = 0; @@ -61,52 +114,71 @@ unsigned *countSamples(Tokenizer *tkz, int *npops, int *transpose) { int i, j; long unsigned h; char *token, *end; - unsigned *nsamples = NULL; + FIFOstack **fifo=NULL; // array of FIFOstack objects, one per population int ntokens = Tokenizer_ntokens(tkz); - - *npops=0; + int npops=0; // Read through tokens, looking for -I and -eI. Use these arguments - // to set npops and nsamples. + // to set npops and array of fifo stacks. 
for(i=1; i < ntokens; ++i) { token = Tokenizer_token(tkz, i); if(strcmp("-I", token) == 0 || strcmp("-eI", token) == 0) { - if(*npops == 0) { + if(npops == 0) { // count populations and allocate nsamples - assert(nsamples == NULL); + assert(fifo == NULL); for(j=i+2; j 0); - assert(nsamples != NULL); - for(j=0; j < *npops; ++j) { + // increment fifo stacks + assert(npops > 0); + assert(fifo != NULL); + for(j=0; j < npops; ++j) { token = Tokenizer_token(tkz, i+2+j); h = strtoul(token, &end, 10); assert(end != token); - nsamples[j] += h; + if(h>0) + fifo[j] = FIFOstack_push(fifo[j], h); } // advance to last argument of -I or -eI - i += 1 + *npops; + i += 1 + npops; }else if(0 == strcmp(token, "-transpose-segsites")) *transpose = 1; } - // Remove populations with zero samples - *npops = removeZeroes(*npops, nsamples); + + *sampleDim=0; + for(j=0; j < npops; ++j) + *sampleDim += FIFOstack_length(fifo[j]); + assert(*sampleDim > 0); + unsigned *nsamples = malloc(*sampleDim * sizeof(nsamples[0])); + CHECKMEM(nsamples); + for(i=j=0; i < npops; ++i) { + unsigned n; + while(fifo[i]) { + fifo[i] = FIFOstack_pop(fifo[i], &n); + nsamples[j++] = n; + } + } + assert(j == *sampleDim); +#ifndef NDEBUG + for(i=0; i < npops; ++i) + assert(fifo[i] == NULL); +#endif + free(fifo); + return nsamples; } @@ -143,7 +215,7 @@ ScrmReader *ScrmReader_new(FILE *fp) { Tokenizer_strip(self->tkz, " \n"); int transpose; - self->nsamples = countSamples(self->tkz, &self->npops, &transpose); + self->nsamples = countSamples(self->tkz, &self->sampleDim, &transpose); if(!transpose) { fprintf(stderr,"%s:%d: -transpose-segsites missing from scrm cmd\n", __FILE__,__LINE__); @@ -154,7 +226,7 @@ ScrmReader *ScrmReader_new(FILE *fp) { } unsigned tot = 0; - for(i=0; i < self->npops; ++i) { + for(i=0; i < self->sampleDim; ++i) { tot += self->nsamples[i]; } unsigned tot2 = strtoul(Tokenizer_token(self->tkz, 1), NULL, 10); @@ -167,7 +239,7 @@ ScrmReader *ScrmReader_new(FILE *fp) { } // allocate daf array - 
self->daf = malloc(self->npops * sizeof(self->daf[0])); + self->daf = malloc(self->sampleDim * sizeof(self->daf[0])); CHECKMEM(self->daf); self->chr = self->nucpos = -1; @@ -228,7 +300,7 @@ int ScrmReader_next(ScrmReader *self) { int pop, i; int start=2; // skip 1st two columns char *token, *end; - for(pop=0; pop < self->npops; ++pop) { + for(pop=0; pop < self->sampleDim; ++pop) { nderived = 0.0; for(i=start; i < start + self->nsamples[pop]; ++i) { if(i >= Tokenizer_ntokens(self->tkz)) { @@ -262,19 +334,19 @@ unsigned long ScrmReader_nucpos(ScrmReader *self) { } // Return number of populations. -int ScrmReader_npops(ScrmReader *self) { - return self->npops; +int ScrmReader_sampleDim(ScrmReader *self) { + return self->sampleDim; } // Return number of samples from population i. int ScrmReader_nsamples(ScrmReader *self, int i) { - assert(i < self->npops); + assert(i < self->sampleDim); return self->nsamples[i]; } // Return frequency of derived allele in sample from population i. double ScrmReader_daf(ScrmReader *self, int i) { - assert(i < self->npops); + assert(i < self->sampleDim); assert(self->chr >= 0); assert(self->nucpos >= 0); return self->daf[i]; @@ -286,6 +358,7 @@ double ScrmReader_daf(ScrmReader *self, int i) { # error "Unit tests must be compiled without -DNDEBUG flag" # endif +// nsamples = [6, 6, 2, 2, 2] const char *cmd = "scrm 18 2 -l 100r -t 1.35351 -r 0.966782 1000" " -transpose-segsites -SC abs -I 5 6 6 0 0 0 -eI 0.0192475 0 0 2 0 0" " -eI 0.00561032 0 0 2 0 0 -eI 0.0117678 0 0 0 2 0 -n 1 2.0687" @@ -306,30 +379,53 @@ int main(int argc, char **argv) { } verbose = 1; } - int i, npops=0, transpose; + + // test FIFOstack + FIFOstack *stack = NULL; + stack = FIFOstack_push(stack, 1u); + stack = FIFOstack_push(stack, 2u); + assert(2 == FIFOstack_length(stack)); + unsigned x=0; + stack = FIFOstack_pop(stack, &x); + assert(1u == x); + assert(1 == FIFOstack_length(stack)); + stack = FIFOstack_pop(stack, &x); + assert(2u == x); + assert(0 == 
FIFOstack_length(stack)); + unitTstResult("FIFOstack", "OK"); + + int i, sampleDim=0, transpose; char buff[1000]; unsigned *nsamples; Tokenizer *tkz = Tokenizer_new(sizeof(buff)/2); CHECKMEM(tkz); + // test countSamples strcpy(buff, cmd); Tokenizer_split(tkz, buff, " "); Tokenizer_strip(tkz, " \n"); - nsamples = countSamples(tkz, &npops, &transpose); + nsamples = countSamples(tkz, &sampleDim, &transpose); if(verbose) { - printf("countSamples returned: npops=%d; nsamples =", npops); - for(i=0; i < npops; ++i) + printf("countSamples returned: sampleDim=%d; nsamples =", sampleDim); + for(i=0; i < sampleDim; ++i) printf(" %u", nsamples[i]); putchar('\n'); } + printf("sampleDim=%d\n", sampleDim); + assert(sampleDim == 5); + assert(nsamples[0] == 6); + assert(nsamples[1] == 6); + assert(nsamples[2] == 2); + assert(nsamples[3] == 2); + assert(nsamples[4] == 2); if(!transpose) { fprintf(stderr,"%s:%d: -transpose-segsites missing from scrm cmd\n", __FILE__,__LINE__); exit(1); } - unitTstResult("countSamples", "OK"); + // test readuntil FILE *fp = fopen("output.scrm", "r"); int status; assert(fp); @@ -345,26 +441,29 @@ int main(int argc, char **argv) { } unitTstResult("readuntil", "OK"); + // test scrmreader rewind(fp); ScrmReader *r = ScrmReader_new(fp); assert(-1 == ScrmReader_chr(r)); assert(-1 == ScrmReader_nucpos(r)); - int np = ScrmReader_npops(r); - assert(np == 4); + int np = ScrmReader_sampleDim(r); + assert(np == 5); assert(6 == ScrmReader_nsamples(r, 0)); assert(6 == ScrmReader_nsamples(r, 1)); - assert(4 == ScrmReader_nsamples(r, 2)); + assert(2 == ScrmReader_nsamples(r, 2)); assert(2 == ScrmReader_nsamples(r, 3)); + assert(2 == ScrmReader_nsamples(r, 4)); status = ScrmReader_next(r); assert(status==0); assert(0 == ScrmReader_chr(r)); assert(0 == ScrmReader_nucpos(r)); - // 0 0 0 0 0 0 | 0 0 0 0 0 0 | 0 0 0 0 | 1 1 + // 0 0 0 0 0 0 | 0 0 0 0 0 0 | 0 0 | 0 0 | 1 1 assert(0.0 == ScrmReader_daf(r, 0)); assert(0.0 == ScrmReader_daf(r, 1)); assert(0.0 == 
ScrmReader_daf(r, 2)); - assert(1.0 == ScrmReader_daf(r, 3)); + assert(0.0 == ScrmReader_daf(r, 3)); + assert(1.0 == ScrmReader_daf(r, 4)); if(verbose) { for(i=0; i Date: Fri, 12 Jan 2018 09:51:27 -0700 Subject: [PATCH 012/101] Cosmetic --- src/scrmpat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scrmpat.c b/src/scrmpat.c index 19fed5c6..683800ab 100644 --- a/src/scrmpat.c +++ b/src/scrmpat.c @@ -401,7 +401,7 @@ int main(int argc, char **argv) { } // Sort site patterns. Major sort is by number of "on" bits, - // so that doubleton patterns come first, then tripletons, ets. + // so that singleton patterns come first, then doubletons, etc. // Secondary sort is by order in which labels are listed // on the command line. qsort(pat, (size_t) npat, sizeof(pat[0]), compare_tipId); From 9163e2ad906f3a759aabbf734ff722ab7da855d4 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Fri, 12 Jan 2018 09:52:22 -0700 Subject: [PATCH 013/101] Increment version --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.h b/src/version.h index 074929b9..52c78d78 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.0" +#define VERSION "1.1" #endif From b268df328b0eb458542d57b67efe02d08e19c784 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sat, 13 Jan 2018 10:43:25 -0700 Subject: [PATCH 014/101] Modified the logic within src/scrmreader.c that checks to see that a token is an unsigned integer. Previous code assumed that strtoul would reject a token such as "-1". However, it reads it without problem and interprets the result as a positive integer in two's complement. The new code uses strtol instead of strtoul and then checks that the result is non-negative. 
--- src/scrmreader.c | 16 ++++++++-------- src/try.c | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/src/scrmreader.c b/src/scrmreader.c index 467a89dd..b8aafcdc 100644 --- a/src/scrmreader.c +++ b/src/scrmreader.c @@ -11,6 +11,7 @@ #include #include #include +#include typedef struct FIFOstack FIFOstack; @@ -112,7 +113,7 @@ unsigned *countSamples(Tokenizer *tkz, int *sampleDim, int *transpose) { } int i, j; - long unsigned h; + long h; char *token, *end; FIFOstack **fifo=NULL; // array of FIFOstack objects, one per population int ntokens = Tokenizer_ntokens(tkz); @@ -128,10 +129,10 @@ unsigned *countSamples(Tokenizer *tkz, int *sampleDim, int *transpose) { assert(fifo == NULL); for(j=i+2; j=0); if(h>0) - fifo[j] = FIFOstack_push(fifo[j], h); + fifo[j] = FIFOstack_push(fifo[j], (unsigned) h); } // advance to last argument of -I or -eI i += 1 + npops; @@ -411,7 +412,6 @@ int main(int argc, char **argv) { printf(" %u", nsamples[i]); putchar('\n'); } - printf("sampleDim=%d\n", sampleDim); assert(sampleDim == 5); assert(nsamples[0] == 6); assert(nsamples[1] == 6); diff --git a/src/try.c b/src/try.c index ff1a6127..e65e95ab 100644 --- a/src/try.c +++ b/src/try.c @@ -1,7 +1,36 @@ #include -#include +#include +#include int main(void) { - printf("%lf\n", 1e10*DBL_EPSILON); - return 0; + long h; + char token[100], *end; + strcpy(token, "-1"); + + h = strtol(token, &end, 10); + if(end==token || h<0) // token isn't a nonnegative integer + printf("%s is Not a nonnegative integer\n", token); + else // token is a nonnegative integer + printf("%s IS a nonnegative integer: value=%ld\n", token, h); + + strcpy(token, " -eI"); + h = strtol(token, &end, 10); + if(end==token || h<0) // token isn't a nonnegative integer + printf("%s is Not a nonnegative integer\n", token); + else // token is a nonnegative integer + printf("%s IS a nonnegative integer: value=%ld\n", token, h); + + strcpy(token, " -1"); + h = strtol(token, &end, 10); 
+ if(end==token || h<0) // token isn't a nonnegative integer + printf("%s is Not a nonnegative integer\n", token); + else // token is a nonnegative integer + printf("%s IS a nonnegative integer: value=%ld\n", token, h); + + strcpy(token, " 123 "); + h = strtol(token, &end, 10); + if(end==token || h<0) // token isn't a nonnegative integer + printf("%s is Not a nonnegative integer\n", token); + else // token is a nonnegative integer + printf("%s IS a nonnegative integer: value=%ld\n", token, h); } From dad40a4f38f9a965be53d0664e05ee6790f113d3 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sat, 13 Jan 2018 11:08:01 -0700 Subject: [PATCH 015/101] Made a sanity check permanent within scrmreader.c. The previous code used an "assert" statement to check that population sizes were non-negative integers, so this check wasn't done when NDEBUG was defined. Now it's done all the time. --- src/scrmreader.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/scrmreader.c b/src/scrmreader.c index b8aafcdc..b3b076f2 100644 --- a/src/scrmreader.c +++ b/src/scrmreader.c @@ -40,6 +40,7 @@ unsigned *countSamples(Tokenizer *tkz, int *sampleDim, int *transpose); int readuntil(int n, const char *str, int dim, char buff[dim], FILE *fp); FIFOstack *FIFOstack_push(FIFOstack *prev, unsigned val); FIFOstack *FIFOstack_pop(FIFOstack *self, unsigned *value); +FIFOstack *FIFOstack_free(FIFOstack *self); int FIFOstack_length(FIFOstack *self); // Push a value onto the tail of the stack. 
Return pointer to new @@ -85,6 +86,15 @@ int FIFOstack_length(FIFOstack *self) { return 1 + FIFOstack_length(self->next); } +FIFOstack *FIFOstack_free(FIFOstack *self) { + if(self) { + self->next = FIFOstack_free(self->next); + free(self); + } + return NULL; +} + + // destructor void ScrmReader_free(ScrmReader *self) { assert(self); @@ -112,7 +122,7 @@ unsigned *countSamples(Tokenizer *tkz, int *sampleDim, int *transpose) { return NULL; } - int i, j; + int i, j, k; long h; char *token, *end; FIFOstack **fifo=NULL; // array of FIFOstack objects, one per population @@ -150,7 +160,15 @@ unsigned *countSamples(Tokenizer *tkz, int *sampleDim, int *transpose) { for(j=0; j < npops; ++j) { token = Tokenizer_token(tkz, i+2+j); h = strtol(token, &end, 10); - assert(end != token && h>=0); + if(end == token || h < 0) { + fprintf(stderr,"%s:%d: read \"%s\" when" + " expecting a sample size\n", + __FILE__, __LINE__, token); + for(k=0; k0) fifo[j] = FIFOstack_push(fifo[j], (unsigned) h); } @@ -393,6 +411,12 @@ int main(int argc, char **argv) { stack = FIFOstack_pop(stack, &x); assert(2u == x); assert(0 == FIFOstack_length(stack)); + + stack = NULL; + stack = FIFOstack_push(stack, 1u); + stack = FIFOstack_push(stack, 2u); + stack = FIFOstack_free(stack); + assert(stack == NULL); unitTstResult("FIFOstack", "OK"); int i, sampleDim=0, transpose; From 87e94c59d8ed7386aac92a77cd1d9f3cea507ad5 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sat, 13 Jan 2018 16:00:04 -0700 Subject: [PATCH 016/101] Increased dimension of hash table in branchtab.c and moved the modulus operation outside of the hash function. In other words, the hash function now returns a 32-bit integer, and the calling function must use modulus or binary "and" to reduce this to the dimension of the hash table. 
--- src/branchtab.c | 10 +++++----- src/scrmpat.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/branchtab.c b/src/branchtab.c index 2d0f8116..54bdc97e 100644 --- a/src/branchtab.c +++ b/src/branchtab.c @@ -21,7 +21,7 @@ #include /// Dimension of hash table. Must be a power of 2 -#define BT_DIM 16u +#define BT_DIM 256u /// Make sure BT_DIM is a power of 2 #if (BT_DIM==0u || (BT_DIM & (BT_DIM-1u))) @@ -72,7 +72,7 @@ uint32_t tipIdHash( uint32_t key) { key = (key+0xd3a2646c) ^ (key<<9); key = (key+0xfd7046c5) + (key<<3); key = (key^0xb55a4f09) ^ (key>>16); - return key & (BT_DIM-1); + return key; } #elif TIPID_SIZE==64 /// Hash function for a 64-bit integer. @@ -83,7 +83,7 @@ uint32_t tipIdHash(uint64_t key) { key = key ^ (key >> 11); key = key + (key << 6); key = key ^ (key >> 22); - return (uint32_t) (key & (BT_DIM-1)); + return (uint32_t) key; } #else #error "Can't compile tipIdHash function. See branchtab.c" @@ -256,7 +256,7 @@ int BranchTab_hasSingletons(BranchTab * self) { /// Return value corresponding to key, or nan if no value is found. double BranchTab_get(BranchTab * self, tipId_t key) { - unsigned h = tipIdHash(key); + unsigned h = tipIdHash(key) & (BT_DIM-1u); assert(h < BT_DIM); assert(self); return BTLink_get(self->tab[h], key); @@ -266,7 +266,7 @@ double BranchTab_get(BranchTab * self, tipId_t key) { /// old one. void BranchTab_add(BranchTab * self, tipId_t key, double value) { assert(!self->frozen); - unsigned h = tipIdHash(key); + unsigned h = tipIdHash(key) & (BT_DIM-1u); assert(h < BT_DIM); assert(self); self->tab[h] = BTLink_add(self->tab[h], key, value); diff --git a/src/scrmpat.c b/src/scrmpat.c index 683800ab..9662dfd3 100644 --- a/src/scrmpat.c +++ b/src/scrmpat.c @@ -196,7 +196,7 @@ static void Stack_free(Stack * stk) { free(stk); } -/// Add an entry to the stack, checking bounds. +/// Add an entry to the tail of the stack, checking bounds. 
static void Stack_push(Stack * self, tipId_t x) { if(self->nused == self->dim) { fprintf(stderr, "%s:%s:%d ERR: buffer overflow\n", From 9491654c8ad5acc1ff5d30107f0b5c64433d5ea1 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sat, 13 Jan 2018 16:35:39 -0700 Subject: [PATCH 017/101] Changed tipIdHash to an inline function, which calls either uint32Hash or uint64Hash, depending on the size of tipId_t. These latter functions are defined in binary.c rather than in branchtab.c. --- src/binary.c | 23 +++++++++++++++++++++++ src/binary.h | 2 ++ src/branchtab.c | 29 +++++++---------------------- test/xbinary.c | 6 ++++++ 4 files changed, 38 insertions(+), 22 deletions(-) diff --git a/src/binary.c b/src/binary.c index 3650b786..d8606de5 100644 --- a/src/binary.c +++ b/src/binary.c @@ -131,3 +131,26 @@ int num1bits(tipId_t x) { } return n; } + +/// Hash function for a 32-bit integer. From Thomas Wang's 1997 +/// article: /// https://gist.github.com/badboy/6267743 +uint32_t uint32Hash( uint32_t key) { + key = (key+0x7ed55d16) + (key<<12); + key = (key^0xc761c23c) ^ (key>>19); + key = (key+0x165667b1) + (key<<5); + key = (key+0xd3a2646c) ^ (key<<9); + key = (key+0xfd7046c5) + (key<<3); + key = (key^0xb55a4f09) ^ (key>>16); + return key; +} + +/// Hash function for a 64-bit integer. 
+uint32_t uint64Hash(uint64_t key) { + key = (~key) + (key << 18); // key = (key << 18) - key - 1; + key = key ^ (key >> 31); + key = key * 21; // key = (key + (key << 2)) + (key << 4); + key = key ^ (key >> 11); + key = key + (key << 6); + key = key ^ (key >> 22); + return (uint32_t) key; +} diff --git a/src/binary.h b/src/binary.h index 9d36eb07..f119ce1c 100644 --- a/src/binary.h +++ b/src/binary.h @@ -12,6 +12,8 @@ int num1bits(tipId_t x); tipId_t reverseBits(tipId_t x); uint32_t rev32(uint32_t x); uint64_t rev64(uint64_t x); +uint32_t uint32Hash( uint32_t key); +uint32_t uint64Hash(uint64_t key); static inline int isPow2(tipId_t x); static inline int isPow2(tipId_t x) { diff --git a/src/branchtab.c b/src/branchtab.c index 54bdc97e..96eadf4f 100644 --- a/src/branchtab.c +++ b/src/branchtab.c @@ -21,7 +21,7 @@ #include /// Dimension of hash table. Must be a power of 2 -#define BT_DIM 256u +#define BT_DIM 128u /// Make sure BT_DIM is a power of 2 #if (BT_DIM==0u || (BT_DIM & (BT_DIM-1u))) @@ -57,33 +57,18 @@ BTLink *BTLink_dup(const BTLink *self); int BTLink_equals(const BTLink *lhs, const BTLink *rhs); #if TIPID_SIZE==32 -uint32_t tipIdHash(uint32_t key); +static inline uint32_t tipIdHash(uint32_t key); #elif TIPID_SIZE==64 -uint32_t tipIdHash(uint64_t key); +static inline uint32_t tipIdHash(uint64_t key); #endif #if TIPID_SIZE==32 -/// Hash function for a 32-bit integer. From Thomas Wang's 1997 -/// article: /// https://gist.github.com/badboy/6267743 -uint32_t tipIdHash( uint32_t key) { - key = (key+0x7ed55d16) + (key<<12); - key = (key^0xc761c23c) ^ (key>>19); - key = (key+0x165667b1) + (key<<5); - key = (key+0xd3a2646c) ^ (key<<9); - key = (key+0xfd7046c5) + (key<<3); - key = (key^0xb55a4f09) ^ (key>>16); - return key; +static inline uint32_t tipIdHash( uint32_t key) { + return uint32Hash(key); } #elif TIPID_SIZE==64 -/// Hash function for a 64-bit integer. 
-uint32_t tipIdHash(uint64_t key) { - key = (~key) + (key << 18); // key = (key << 18) - key - 1; - key = key ^ (key >> 31); - key = key * 21; // key = (key + (key << 2)) + (key << 4); - key = key ^ (key >> 11); - key = key + (key << 6); - key = key ^ (key >> 22); - return (uint32_t) key; +static inline uint64_t tipIdHash( uint64_t key) { + return uint64Hash(key); } #else #error "Can't compile tipIdHash function. See branchtab.c" diff --git a/test/xbinary.c b/test/xbinary.c index 5f054257..bfaf5cc2 100644 --- a/test/xbinary.c +++ b/test/xbinary.c @@ -95,5 +95,11 @@ int main(void) { printBits(sizeof(r64), &r64, stdout); putchar('\n'); + uint32_t key32 = 1234u; + printf("32-bit key %u -> hash %u\n", key32, uint32Hash(key32)); + + uint64_t key64 = 1234u; + printf("64-bit key %lu -> hash %u\n", key64, uint64Hash(key64)); + return 0; } From f71295a627da2568458f1e2292a4a2f5568f4271 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sat, 13 Jan 2018 16:43:06 -0700 Subject: [PATCH 018/101] . --- src/binary.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/binary.c b/src/binary.c index d8606de5..525b56ce 100644 --- a/src/binary.c +++ b/src/binary.c @@ -133,7 +133,8 @@ int num1bits(tipId_t x) { } /// Hash function for a 32-bit integer. From Thomas Wang's 1997 -/// article: /// https://gist.github.com/badboy/6267743 +/// article: +/// https://gist.github.com/badboy/6267743 uint32_t uint32Hash( uint32_t key) { key = (key+0x7ed55d16) + (key<<12); key = (key^0xc761c23c) ^ (key>>19); From 40a52a0861531673cfc0030a11293d409493d8ed Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Mon, 15 Jan 2018 11:58:29 -0700 Subject: [PATCH 019/101] . --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 70cf2b46..dc887936 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *~ core depend +doc raf xboot xdtnorm From 911be5e3f75fe85ea4c7dfed87cd577ccad9e47c Mon Sep 17 00:00:00 2001 From: "Alan R. 
Rogers" Date: Mon, 22 Jan 2018 17:16:48 -0700 Subject: [PATCH 020/101] Made error msg more informative. --- src/legofit.c | 2 +- src/parse.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/legofit.c b/src/legofit.c index 1686d818..225864b3 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -129,7 +129,7 @@ reported in the legofit output. To double this value, use "-T 2e-4" or @copyright Copyright (c) 2016, 2017, Alan R. Rogers . This file is released under the Internet Systems Consortium License, which can be found in file "LICENSE". -*/ +**/ #include "branchtab.h" #include "cost.h" diff --git a/src/parse.c b/src/parse.c index b6f64918..168eeda3 100644 --- a/src/parse.c +++ b/src/parse.c @@ -182,7 +182,9 @@ void parseParam(char *next, enum ParamType type, if(!ok) { fprintf(stderr,"%s:%d: \"%s\" is not a legal parameter name.\n", __FILE__,__LINE__, name); - fprintf(stderr," input: %s\n", orig); + fprintf(stderr," Legal names consist of a letter followed by" + " letters, digits, and underscores.\n"); + fprintf(stderr," Input: %s\n", orig); exit(EXIT_FAILURE); } From c8854b53a7793c40edd390b8d176840a460a2030 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 23 Jan 2018 11:43:42 -0700 Subject: [PATCH 021/101] More precision in output of diverg.py --- src/diverg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diverg.py b/src/diverg.py index fa5dec3a..6c67c4ac 100755 --- a/src/diverg.py +++ b/src/diverg.py @@ -147,9 +147,9 @@ def openInput(fname): print fmt % (pat1[i], prob1[i]) exit(0) -fmt = "%%%ds %%%ds %%%ds %%6s" % (widpat, wid1, wid2) +fmt = "%%%ds %%%ds %%%ds %%7s" % (widpat, wid1, wid2) print fmt % ("SitePat", fname1, fname2, "KL") -fmt = "%%%ds %%%d.5f %%%d.5f %%6.3f" % (widpat, wid1, wid2) +fmt = "%%%ds %%%d.5f %%%d.5f %%7.4f" % (widpat, wid1, wid2) KLsum = 0.0 for i in range(len(prob1)): From f6411c7908d4e239b741b54e503530f2e33627ac Mon Sep 17 00:00:00 2001 From: "Alan R. 
Rogers" Date: Wed, 24 Jan 2018 20:09:09 -0700 Subject: [PATCH 022/101] . --- src/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Makefile b/src/Makefile index 4cd9b99b..12a95ff4 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,7 @@ destination := $(HOME)/bin global_destination := $(HOME)/group/bin -opt := -DNDEBUG -O3 -finline-functions # For full optimization -#opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging +#opt := -DNDEBUG -O3 -finline-functions # For full optimization +opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging #opt := -O3 -finline-functions # Optimization + debugging #prof := -pg -rdynamic # For profiling prof := From c56327e7fd1982c292a1dc815fccaa5b517ec83d Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Wed, 24 Jan 2018 20:48:25 -0700 Subject: [PATCH 023/101] Fixed bug in scrmpat --- src/scrmpat.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/scrmpat.c b/src/scrmpat.c index 9662dfd3..18f48c74 100644 --- a/src/scrmpat.c +++ b/src/scrmpat.c @@ -410,7 +410,7 @@ int main(int argc, char **argv) { // Used by bootstrap Boot *boot = NULL; int nchr = 0; - unsigned prev, chr=0; + unsigned prev, chr=UINT_MAX; long nsnp[MAXCHR]; memset(nsnp, 0, sizeof nsnp); @@ -484,12 +484,12 @@ int main(int argc, char **argv) { } unsigned long nsites = 0, nbadaa = 0, nfixed = 0; - long snpndx = -1; + long snpndx = -1; // Read data fprintf(stderr, "Doing %s pass through data to tabulate patterns..\n", bootreps > 0 ? "2nd" : "single"); - int chrndx = -1, currChr = INT_MAX; + int chrndx = -1, currChr = INT_MAX; done=0; while(!done) { status = ScrmReader_next(r); @@ -640,6 +640,8 @@ int main(int argc, char **argv) { Boot_free(boot); if(logfile) fclose(logfile); + if(ifp!=stdin) + fclose(ifp); fprintf(stderr, "scrmpat is finished\n"); return 0; } From e7bad4350d637c8f9f8521ee99b305d750805fae Mon Sep 17 00:00:00 2001 From: "Alan R. 
Rogers" Date: Wed, 24 Jan 2018 20:51:20 -0700 Subject: [PATCH 024/101] Bump version --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.h b/src/version.h index 52c78d78..6c16add6 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.1" +#define VERSION "1.2" #endif From ce44aee1fb55ac266cf80ab00704d7bf228c939e Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Wed, 24 Jan 2018 21:06:55 -0700 Subject: [PATCH 025/101] scrmpat now allows for 2000 chromosomes. It also gives an informative error message if this limit is exceeded. --- src/scrmpat.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/scrmpat.c b/src/scrmpat.c index 18f48c74..36911b74 100644 --- a/src/scrmpat.c +++ b/src/scrmpat.c @@ -135,7 +135,7 @@ Systems Consortium License, which can be found in file "LICENSE". #include #include -#define MAXCHR 24 // maximum number of chromosomes +#define MAXCHR 2000 // maximum number of chromosomes typedef struct Stack Stack; @@ -460,6 +460,11 @@ int main(int argc, char **argv) { prev = chr; chr = ScrmReader_chr(r); if(prev != chr) { + if(nchr >= MAXCHR) { + fprintf(stderr,"%s:%d: too many chromosomes. max=%d\n", + __FILE__,__LINE__, MAXCHR); + exit(EXIT_FAILURE); + } nsnp[nchr] = 1; ++nchr; } else From b2de814f33ea45395f195ff3e0cab655a7e7a04d Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Wed, 24 Jan 2018 21:09:19 -0700 Subject: [PATCH 026/101] Bump version --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.h b/src/version.h index 6c16add6..f2cb4570 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.2" +#define VERSION "1.3" #endif From c3b2e4bf09b1aa3d65e10ac7e1b014202f89e97f Mon Sep 17 00:00:00 2001 From: "Alan R. 
Rogers" Date: Wed, 24 Jan 2018 21:14:50 -0700 Subject: [PATCH 027/101] Optimization on --- src/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Makefile b/src/Makefile index 12a95ff4..4cd9b99b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,7 @@ destination := $(HOME)/bin global_destination := $(HOME)/group/bin -#opt := -DNDEBUG -O3 -finline-functions # For full optimization -opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging +opt := -DNDEBUG -O3 -finline-functions # For full optimization +#opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging #opt := -O3 -finline-functions # Optimization + debugging #prof := -pg -rdynamic # For profiling prof := From 328e88e6cf3da1b61670977039268b47460041dd Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Wed, 24 Jan 2018 21:22:46 -0700 Subject: [PATCH 028/101] In tabpat.c, add code to check bounds of nsnp array. Abort with an error message if the data contains more chromosomes than provided for by the MAXCHR macro. --- src/tabpat.c | 5 +++++ src/version.h | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/tabpat.c b/src/tabpat.c index ecac2417..96c8f665 100644 --- a/src/tabpat.c +++ b/src/tabpat.c @@ -458,6 +458,11 @@ int main(int argc, char **argv) { int diff = strcmp(prev, chr); if(diff != 0) { StrInt_insert(strint, chr, nchr); + if(nchr >= MAXCHR) { + fprintf(stderr,"%s:%d: too many chromosomes. max=%d\n", + __FILE__,__LINE__, MAXCHR); + exit(EXIT_FAILURE); + } nsnp[nchr] = 1; ++nchr; } else diff --git a/src/version.h b/src/version.h index f2cb4570..e3c9c610 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.3" +#define VERSION "1.4" #endif From 9c60a74076b0f1557d630e945bfb6609a3980a2e Mon Sep 17 00:00:00 2001 From: "Alan R. 
Rogers" Date: Thu, 25 Jan 2018 02:14:08 -0700 Subject: [PATCH 029/101] Changed from -g to -ggdb in Makefile --- src/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Makefile b/src/Makefile index 4cd9b99b..f3894484 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,7 @@ destination := $(HOME)/bin global_destination := $(HOME)/group/bin opt := -DNDEBUG -O3 -finline-functions # For full optimization -#opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging +#opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging #opt := -O3 -finline-functions # Optimization + debugging #prof := -pg -rdynamic # For profiling prof := @@ -29,7 +29,7 @@ warn := \ -Wundef \ -Wwrite-strings -CFLAGS := -g -std=gnu99 $(warn) $(incl) $(opt) $(prof) $(osargs) +CFLAGS := -ggdb -std=gnu99 $(warn) $(incl) $(opt) $(prof) $(osargs) lib := -L/usr/local/lib -lgsl -lgslcblas -lpthread -lm From bbe8858795dc0fc862919ff84a16c85f9e48442c Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Thu, 25 Jan 2018 02:15:02 -0700 Subject: [PATCH 030/101] Back to -g --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index f3894484..581de354 100644 --- a/src/Makefile +++ b/src/Makefile @@ -29,7 +29,7 @@ warn := \ -Wundef \ -Wwrite-strings -CFLAGS := -ggdb -std=gnu99 $(warn) $(incl) $(opt) $(prof) $(osargs) +CFLAGS := -g -std=gnu99 $(warn) $(incl) $(opt) $(prof) $(osargs) lib := -L/usr/local/lib -lgsl -lgslcblas -lpthread -lm From ba40d42ca9939b368e60c088597d495c963f2704 Mon Sep 17 00:00:00 2001 From: "Alan R. 
Rogers" Date: Fri, 26 Jan 2018 09:16:23 -0700 Subject: [PATCH 031/101] Update documentation --- src/README.md | 35 ++++++++++++++++++++++++++--------- src/scrmpat.c | 49 +++++++++++++++++++++++++++++-------------------- 2 files changed, 55 insertions(+), 29 deletions(-) diff --git a/src/README.md b/src/README.md index da334615..0aa01f53 100644 --- a/src/README.md +++ b/src/README.md @@ -6,27 +6,29 @@ Legofit is a computer package that uses counts of nucleotide site patterns to estimate the history of population size, subdivision, and gene flow. The package consists of the following programs -* @ref daf "daf", which writes genetic data into the ".daf" format, +* @ref daf "daf" writes genetic data into the ".daf" format, which is used by @ref tabpat "tabpat". -* @ref tabpat "tabpat", which reads ".daf" files for several +* @ref tabpat "tabpat" reads ".daf" files for several populations, tabulates "nucleotide site patterns" (explained below), and generates moving-blocks bootstrap replicates. -* @ref raf "raf", which writes genetic data into the ".raf" format, +* @ref raf "raf" writes genetic data into the ".raf" format, which is used by @ref sitepat "sitepat". -* @ref sitepat "sitepat", which reads ".raf" files for several +* @ref sitepat "sitepat" reads ".raf" files for several populations, tabulates "nucleotide site patterns" (explained below), and generates moving-blocks bootstrap replicates. -* @ref legosim "legosim", which predicts site pattern counts from +* @ref scrmpat "scrmpat" tabulates site patterns from output generated + by the `scrm` coalescent simulator. +* @ref legosim "legosim" predicts site pattern counts from assumptions about population history. -* @ref legofit "legofit", which estimates parameters from site pattern +* @ref legofit "legofit" estimates parameters from site pattern counts. 
-* @ref bootci "bootci.py", which uses multiple legofit output files +* @ref bootci "bootci.py" uses multiple legofit output files (one for the real data and one for each bootstrap replicate) to generate bootstrap confidence intervals for estimated parameters. -* @ref flatfile "flatfile.py", which reads a list of legofit output +* @ref flatfile "flatfile.py" reads a list of legofit output files and writes a flat file with a row for each legofit file and a column for each parameter. -* @ref diverg "diverg.py", which compares two distributions of site +* @ref diverg "diverg.py" compares two distributions of site pattern frequencies, using the Kullback-Leibler (KL) divergence. # Nucleotide site patterns {#sitepat} @@ -453,6 +455,21 @@ format is described above. Then, you can execute `legosim` by typing: See the @ref legosim "legosim" documentation for details. +## Simulating site patterns + +The preferred approach is to do simulations using +[scrm](https://scrm.github.io/), a software package written by Paul +R. Staab, Sha Zhu, Dirk Metzler and Gerton Lunter. It does coalescent +simulations with linkage and recombination. Our own program @ref +scrmpat "scrmpat" tabulates site pattern frequencies from `scrm` +output. + +For less sophisticated simulations, use the `-U` option of @ref +legosim "legosim". This assumes free recombination between nucleotide +sites. This is not ideal, because it ignores genetic linkage, but it +reads the same input files as `legosim`, is very fast, and generates +output in the form of site pattern frequencies. + ## Estimating parameters from genetic data This involves several programs. The first step is to generate input diff --git a/src/scrmpat.c b/src/scrmpat.c index 36911b74..1647c3a8 100644 --- a/src/scrmpat.c +++ b/src/scrmpat.c @@ -12,16 +12,11 @@ bootstrap, writing each bootstrap replicate into a separate file. # Usage - Usage: scrmpat [options] ... where and are - arbitrary labels, which refer to the populations in the input - data. 
The number and order of these labels must agree with - those specified on the scrm command line (using scrm arguments - -I and/or -eI). Labels may not include the character - ":". Maximum number of input files: 32. Writes to standard - output. - - Bootstrap output is available only if input comes from a file - rather than from standard input. + Usage: scrmpat [options] ... + where , , etc. are arbitrary labels, whose number and order + must agree with that of the populations specified in the scrm command + line (using scrm arguments -I and -eI). Labels may not include the + character ":". Writes to standard output. Max number of input files: 32. Options may include: --infile @@ -36,6 +31,8 @@ bootstrap, writing each bootstrap replicate into a separate file. log fixed sites to scrmpat.log -a or --logAll log all sites to scrmpat.log + --version + Print version and exit -h or --help Print this message @@ -46,14 +43,24 @@ should include the option `-transpost-segsites`. Let us assume you have done this, that file `foo.scrm` contains the output simulated by `scrm`, and that these simulated data included genotypes referring to four populations, labeled "x", "y", "n", and "d". The `scrmpat` -command woule look like this: -~/daf contains a separate daf file for each population. We want to -compare 4 populations, whose .daf files are `yri.daf`, `ceu.daf`, -`altai.daf`, and `denisova.daf`. The following command will do this, -putting the results into `obs.txt`. +command would look like this: scrmpat --infile foo.scrm x y n d +`scrmpat`'s notion of a "population" differs from that of `scrm`, in +that `scrmpat` treats samples of different ages as separate +populations, even if they reside in the same population on the `scrm` +command line. For example, consider the following `scrm` command line: + + scrm 3 -I 2 1 1 -eI 0.5 0 1 + +This specifies three haploid samples distributed across two +populations. The `-I` argument says that each population has a +sample at time 0. 
The `-eI` argument says that, in addition, +population 2 has a sample at time 0.5. All three samples would be +treated as separate populations by `scrmpat`. Thus, the `scrmpat` +command line should list three labels, as in "scrmpat x y z". + In the output, site pattern "x:y" refers to the pattern in which the derived allele is present haploid samples from "x" and "y" but not on those from other populations. The order of @@ -63,6 +70,8 @@ pattern labeled "x:y:d" rather than, say, "y:x:d". The output looks like this: + # scrmpat version 1.3 + # Population labels: x y n d # Number of site patterns: 10 # Tabulated 12327755 SNPs # SitePat E[count] @@ -154,15 +163,15 @@ static void generatePatterns(int bit, int npops, Stack * stk, tipId_t pat, const char *useMsg = "\nUsage: scrmpat [options] ...\n" - " where and are arbitrary labels, whose number and order must\n" - " agree with that of the populations specified in the scrm command.\n" - " Writes to standard output. Labels may not include\n" - " the character \":\"."; + " where , , etc. are arbitrary labels, whose number and order\n" + " must agree with that of the populations specified in the scrm command\n" + " line (using scrm arguments -I and -eI). Labels may not include the\n" + " character \":\". Writes to standard output."; /// Print usage message and die. static void usage(void) { fputs(useMsg, stderr); - fprintf(stderr, " Maximum number of input files: %lu.\n", + fprintf(stderr, " Max number of input files: %lu.\n", 8 * sizeof(tipId_t)); fputs("\nOptions may include:\n", stderr); tellopt("--infile ", From 04b93b9995b4ef5e54ad6a1d735984a9a847a09a Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Fri, 16 Feb 2018 09:22:37 -0700 Subject: [PATCH 032/101] Increased size of input buffer in DAFReader_next and RAFReader_next. 
--- src/dafreader.c | 2 +- src/rafreader.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dafreader.c b/src/dafreader.c index d3e54c60..abc991d7 100644 --- a/src/dafreader.c +++ b/src/dafreader.c @@ -70,7 +70,7 @@ int DAFReader_next(DAFReader * self) { int ntokens1; int ntokens; int status; - char buff[100]; + char buff[1024]; long unsigned prevnucpos = 0UL; // Find a line of input diff --git a/src/rafreader.c b/src/rafreader.c index 5757a851..8002deb1 100644 --- a/src/rafreader.c +++ b/src/rafreader.c @@ -73,7 +73,7 @@ int iscomment(const char *s) { int RAFReader_next(RAFReader * self) { int ntokens; int status; - char buff[100]; + char buff[1024]; long unsigned prevnucpos = 0UL; // Find a line of input From df209de2e6de9f8ba8a78f39068a87a2e13b4b3b Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Fri, 16 Feb 2018 10:10:04 -0700 Subject: [PATCH 033/101] daf.c now ignores sites at which REF, ALT or ancestral allele consist of more than a single nucleotide. raf.c does the same, except that it doesn't use ancestral allele. rafreader.c and dafreader.c ignore sites at which any of the alleles consist of more or fewer than a single nucleotide. 
--- src/daf.c | 32 ++++++++++++++++++++++++++++++++ src/dafreader.c | 8 ++++++++ src/raf.c | 24 ++++++++++++++++++++++++ src/rafreader.c | 8 ++++++++ 4 files changed, 72 insertions(+) diff --git a/src/daf.c b/src/daf.c index 08ed67de..489995e6 100644 --- a/src/daf.c +++ b/src/daf.c @@ -74,6 +74,7 @@ int main(int argc, char **argv) { long int zeroref = 0, zeroalt = 0, zeroaa = 0, zerogtype = 0; long int missref = 0, missaa = 0; long int multref = 0, multalt = 0, multaa = 0; + long int indelref=0, indelalt=0, indelaa=0; long int nbad = 0, ngood = 0; int ok; // is current line acceptable long unsigned lastnucpos = 0, nucpos; @@ -214,6 +215,25 @@ int main(int argc, char **argv) { ++nbad; continue; } + + // Skip if ref, alt or aa are indels + if(strlen(ref[0]) != 1) { + ++indelref; + ok=0; + } + if(strlen(alt[0]) != 1) { + ++indelalt; + ok=0; + } + if(strlen(aa[0]) != 1) { + ++indelaa; + ok=0; + } + if(!ok) { + ++nbad; + continue; + } + // Skip if ref or aa are missing. if(0==strcmp(aa[0], ".")) { ++missaa; @@ -328,6 +348,18 @@ int main(int argc, char **argv) { fprintf(stderr, "daf: bad sites with multiple ancestral alleles: %ld\n", multaa); + if(indelref) + fprintf(stderr, + "daf: bad sites with ref allele an indel: %ld\n", + indelref); + if(indelalt) + fprintf(stderr, + "daf: bad sites with alt allele an indel: %ld\n", + indelalt); + if(indelaa) + fprintf(stderr, + "daf: bad sites with ancestral allele an indel: %ld\n", + indelaa); if(missref) fprintf(stderr, "daf: bad sites with missing ref alleles: %ld\n", missref); diff --git a/src/dafreader.c b/src/dafreader.c index abc991d7..72843b30 100644 --- a/src/dafreader.c +++ b/src/dafreader.c @@ -86,6 +86,14 @@ int DAFReader_next(DAFReader * self) { continue; ntokens1 = Tokenizer_split(self->tkz, buff, " "); ntokens = Tokenizer_strip(self->tkz, " \n"); + if( ntokens == 5) { + // ancestral allele must be a single nucleotide + if(1 != strlen(Tokenizer_token(self->tkz, 2))) + continue; + // derived allele must be a single 
nucleotide + if(1 != strlen(Tokenizer_token(self->tkz, 3))) + continue; + } if(ntokens > 0) break; } diff --git a/src/raf.c b/src/raf.c index 22e7543a..3d52a866 100644 --- a/src/raf.c +++ b/src/raf.c @@ -71,6 +71,7 @@ int main(int argc, char **argv) { long int zeroref = 0, zeroalt = 0, zerogtype = 0; long int missref = 0; long int multref = 0, multalt = 0; + long int indelref=0, indelalt=0; long int nbad = 0, ngood = 0; int ok; // is current line acceptable long unsigned lastnucpos = 0, nucpos; @@ -192,6 +193,21 @@ int main(int argc, char **argv) { ++nbad; continue; } + + // Skip if ref or alt are indels + if(strlen(ref[0]) != 1) { + ++indelref; + ok=0; + } + if(strlen(alt[0]) != 1) { + ++indelalt; + ok=0; + } + if(!ok) { + ++nbad; + continue; + } + // Skip if ref is missing. if(0==strcmp(ref[0], ".")) { ++missref; @@ -272,6 +288,14 @@ int main(int argc, char **argv) { if(multalt) fprintf(stderr, "raf: bad sites with multiple alt alleles: %ld\n", multalt); + if(indelref) + fprintf(stderr, + "raf: bad sites with ref allele an indel: %ld\n", + indelref); + if(indelalt) + fprintf(stderr, + "raf: bad sites with alt allele an indel: %ld\n", + indelalt); if(missref) fprintf(stderr, "raf: bad sites with missing ref alleles: %ld\n", missref); diff --git a/src/rafreader.c b/src/rafreader.c index 8002deb1..928875df 100644 --- a/src/rafreader.c +++ b/src/rafreader.c @@ -91,6 +91,14 @@ int RAFReader_next(RAFReader * self) { continue; Tokenizer_split(self->tkz, buff, "\t"); ntokens = Tokenizer_strip(self->tkz, " \n"); + if( ntokens == 5) { + // reference allele must be a single nucleotide + if(1 != strlen(Tokenizer_token(self->tkz, 2))) + continue; + // alternate allele must be a single nucleotide + if(1 != strlen(Tokenizer_token(self->tkz, 3))) + continue; + } if(ntokens > 0) break; } From 881cce744269be742028b47c43ce7c7f99e9a589 Mon Sep 17 00:00:00 2001 From: "Alan R. 
Rogers" Date: Sat, 17 Feb 2018 13:20:55 -0700 Subject: [PATCH 034/101] More verbose output when tabpat cannot open bootstrap file for output. --- src/tabpat.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/tabpat.c b/src/tabpat.c index 96c8f665..971a05f3 100644 --- a/src/tabpat.c +++ b/src/tabpat.c @@ -619,8 +619,11 @@ int main(int argc, char **argv) { DIE("buffer overflow in snprintf"); FILE *fp = fopen(buff, "w"); - if(fp == NULL) - DIE("bad fopen"); + if(fp == NULL) { + fprintf(stderr,"%s:%d: can't open \"%s\" for output.\n", + __FILE__,__LINE__,buff); + exit(EXIT_FAILURE); + } fprintf(fp, "# %13s %20s", "SitePat", "E[count]\n"); for(i = 0; i < npat; ++i) { fprintf(fp, "%15s %20.7lf\n", From 5400489cc7416b3cc9ebbbe310022511a1839284 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sat, 17 Feb 2018 13:22:35 -0700 Subject: [PATCH 035/101] More verbose output when sitepat fails to open bootstrap file for output. --- src/sitepat.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/sitepat.c b/src/sitepat.c index 47542c2e..43fc8198 100644 --- a/src/sitepat.c +++ b/src/sitepat.c @@ -637,8 +637,11 @@ int main(int argc, char **argv) { DIE("buffer overflow in snprintf"); FILE *fp = fopen(buff, "w"); - if(fp == NULL) - DIE("bad fopen"); + if(fp == NULL) { + fprintf(stderr,"%s:%d: can't open \"%s\" for output.\n", + __FILE__,__LINE__,buff); + exit(EXIT_FAILURE); + } fprintf(fp, "# %13s %20s", "SitePat", "E[count]\n"); for(i = 0; i < npat; ++i) { fprintf(fp, "%15s %20.7lf\n", From c132ce95cea20ef4f2285c97f0e25fb7b7ed96b8 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 20 Feb 2018 14:37:13 -0700 Subject: [PATCH 036/101] . --- src/boot.c | 116 ++++++++++++++++++++++++++--------------------------- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/src/boot.c b/src/boot.c index 29f2f6fc..05a98daf 100644 --- a/src/boot.c +++ b/src/boot.c @@ -1,3 +1,4 @@ + /** * @file boot.c * @author Alan R. 
Rogers @@ -18,27 +19,27 @@ /// Contains the all data involved in a moving blocks bootstrap of /// a single chromosome. struct BootChr { - long blocksize; ///< number of SNPs per block - long nrep; ///< number of bootstrap replicates - long nsnp; ///< number of snps - long nblock; ///< number of blocks - int npat; ///< number of site patterns - double **count; ///< count[i][j]: j'th site pattern in i'th rep - long **start; ///< start[i][j] = start of j'th block in i'th rep + long blocksize; ///< number of SNPs per block + long nrep; ///< number of bootstrap replicates + long nsnp; ///< number of snps + long nblock; ///< number of blocks + int npat; ///< number of site patterns + double **count; ///< count[i][j]: j'th site pattern in i'th rep + long **start; ///< start[i][j] = start of j'th block in i'th rep }; /// An array of BootChr pointers. struct Boot { - int nchr; ///< number of chromosomes - BootChr **bc; ///< bc[i]: bootstrap for i'th chromosome + int nchr; ///< number of chromosomes + BootChr **bc; ///< bc[i]: bootstrap for i'th chromosome }; /// Contains the data for a bootstrap confidence interval. struct BootConf { - long nrep; ///< repetitions - long blocksize; ///< nucleotide positions per block - double confidence; ///< size of confidence region - double *low, *high; ///< confidence bounds + long nrep; ///< repetitions + long blocksize; ///< nucleotide positions per block + double confidence; ///< size of confidence region + double *low, *high; ///< confidence bounds }; long LInt_div_round(long num, long denom); @@ -47,23 +48,23 @@ static void BootChr_allocArrays(BootChr * self); /// Divide num by denom and round the result to the nearest integer. /// @return an structure of type ldiv_t. 
long LInt_div_round(long num, long denom) { - assert(denom != 0L); - ldiv_t quotrem = ldiv(num, denom); - if(2L * quotrem.rem > denom) - return 1L + quotrem.quot; - return quotrem.quot; + assert(denom != 0L); + ldiv_t quotrem = ldiv(num, denom); + if(2L * quotrem.rem > denom) + return 1L + quotrem.quot; + return quotrem.quot; } /// Return a blocksize that is as close as possible to lengthWanted /// while still making length*nblock close to nsnp. long adjustBlockLength(long lengthWanted, int nsnp) { - long nblock = LInt_div_round(nsnp, lengthWanted); - return LInt_div_round(nsnp, nblock); + long nblock = LInt_div_round(nsnp, lengthWanted); + return LInt_div_round(nsnp, nblock); } /// Constructor for class BootChr. -BootChr *BootChr_new(long nsnp, long nrep, int npat, long blocksize, - gsl_rng * rng) { +BootChr *BootChr_new(long nsnp, long nrep, int npat, long blocksize, + gsl_rng * rng) { long i, j; assert(blocksize > 0); if(nrep == 0) @@ -79,12 +80,12 @@ BootChr *BootChr_new(long nsnp, long nrep, int npat, long blocksize, exit(1); } - BootChr *self = malloc(sizeof(BootChr)); + BootChr *self = malloc(sizeof(BootChr)); CHECKMEM(self); self->nsnp = nsnp; self->nrep = nrep; - self->blocksize = adjustBlockLength(blocksize, nsnp); + self->blocksize = adjustBlockLength(blocksize, nsnp); self->npat = npat; self->nblock = LInt_div_round(nsnp, blocksize); @@ -111,7 +112,7 @@ BootChr *BootChr_new(long nsnp, long nrep, int npat, long blocksize, /// Allocate BootChr's arrays. 
static void BootChr_allocArrays(BootChr * self) { - long i; + long i; self->start = calloc((unsigned long) self->nrep, sizeof(self->start[0])); CHECKMEM(self->start); @@ -132,7 +133,7 @@ static void BootChr_allocArrays(BootChr * self) { #ifndef NDEBUG void BootChr_sanityCheck(const BootChr * self, const char *file, int line) { - long i, j; + long i, j; REQUIRE(self->blocksize > 0, file, line); REQUIRE(self->blocksize < 100000, file, line); REQUIRE(self != NULL, file, line); @@ -145,7 +146,7 @@ void BootChr_sanityCheck(const BootChr * self, const char *file, int line) { REQUIRE(self->start != NULL, file, line); unsigned long endpos = self->nsnp - self->blocksize + 1; - long prev; + long prev; for(i = 0; i < self->nrep; ++i) { REQUIRE(self->count[i] != NULL, file, line); @@ -166,7 +167,7 @@ void BootChr_sanityCheck(const BootChr * self, const char *file, int line) { /// How many copies of snp with index snpndx are present in a given /// repetition (rep)? long BootChr_multiplicity(const BootChr * self, long snpndx, long rep) { - long lndx, hndx, lowtarget; + long lndx, hndx, lowtarget; assert(snpndx < self->nsnp); @@ -185,7 +186,7 @@ long BootChr_multiplicity(const BootChr * self, long snpndx, long rep) { hndx += lndx; assert(hndx == 0 - || self->start[rep][hndx - 1] - snpndx < self->blocksize); + || self->start[rep][hndx - 1] - snpndx < self->blocksize); return hndx - lndx; } @@ -200,8 +201,8 @@ long BootChr_multiplicity(const BootChr * self, long snpndx, long rep) { void BootChr_add(BootChr * self, long snpndx, int pat, double z) { assert(pat < self->npat); assert(snpndx < self->nsnp); - if(!(z >= 0)) - fprintf(stderr,"%s:%s:%d: z=%lf\n", __FILE__,__func__,__LINE__,z); + if(!(z >= 0)) + fprintf(stderr, "%s:%s:%d: z=%lf\n", __FILE__, __func__, __LINE__, z); assert(z >= 0.0); for(register int rep = 0; rep < self->nrep; ++rep) { @@ -209,7 +210,7 @@ void BootChr_add(BootChr * self, long snpndx, int pat, double z) { // in the current bootstrap replicate. 
register long w = BootChr_multiplicity(self, snpndx, rep); - self->count[rep][pat] += w*z; + self->count[rep][pat] += w * z; } } @@ -264,20 +265,20 @@ void BootChr_aggregate(BootChr * self, int rep, int npat, double count[npat]) { assert(self); assert(npat == self->npat); int j; - for(j=0; j < self->npat; ++j) + for(j = 0; j < self->npat; ++j) count[j] += self->count[rep][j]; } /// Constructor for class Boot. -Boot * Boot_new(int nchr, long nsnp[nchr], long nrep, int npat, - long blocksize, gsl_rng *rng) { +Boot *Boot_new(int nchr, long nsnp[nchr], long nrep, int npat, + long blocksize, gsl_rng * rng) { Boot *self = malloc(sizeof(Boot)); CHECKMEM(self); self->nchr = nchr; self->bc = calloc(nchr, sizeof(BootChr *)); CHECKMEM(self->bc); - for(int i=0; i < nchr; ++i) { + for(int i = 0; i < nchr; ++i) { self->bc[i] = BootChr_new(nsnp[i], nrep, npat, blocksize, rng); CHECKMEM(self->bc[i]); } @@ -285,8 +286,8 @@ Boot * Boot_new(int nchr, long nsnp[nchr], long nrep, int npat, } /// Destructor for class Boot. -void Boot_free(Boot *self) { - for(int i=0; i < self->nchr; ++i) +void Boot_free(Boot * self) { + for(int i = 0; i < self->nchr; ++i) BootChr_free(self->bc[i]); free(self->bc); free(self); @@ -300,7 +301,7 @@ void Boot_free(Boot *self) { * @param [in] pat The index of the current site pattern. * @param [in] z the contribution of the snp to the site pattern. */ -void Boot_add(Boot *self, int chr, long snpndx, int pat, double z) { +void Boot_add(Boot * self, int chr, long snpndx, int pat, double z) { BootChr_add(self->bc[chr], snpndx, pat, z); } @@ -312,24 +313,23 @@ void Boot_add(Boot *self, int chr, long snpndx, int pat, double z) { /// @param [out] count An array of doubles. The function will add to /// count[i] the contribution of site pattern i in bootstrap replicate /// rep. 
-void Boot_aggregate(Boot * self, int rep, int npat, - double count[npat]) { - int i; +void Boot_aggregate(Boot * self, int rep, int npat, double count[npat]) { + int i; #ifndef NDEBUG - for(i=0; inchr; ++i) + for(i = 0; i < self->nchr; ++i) BootChr_aggregate(self->bc[i], rep, npat, count); } #ifndef NDEBUG void Boot_sanityCheck(const Boot * self, const char *file, int line) { - for(int i=0; i < self->nchr; ++i) + for(int i = 0; i < self->nchr; ++i) BootChr_sanityCheck(self->bc[i], file, line); } #endif @@ -339,9 +339,9 @@ void Boot_sanityCheck(const Boot * self, const char *file, int line) { double interpolate(double p, double *v, long len) { if(len == 0) return strtod("NAN", 0); - long i, j; - double w; - double goal = p * (len - 1); + long i, j; + double w; + double goal = p * (len - 1); i = floor(goal); j = ceil(goal); @@ -379,7 +379,7 @@ double interpolate(double p, double *v, long len) { */ void confidenceBounds(double *lowBnd, double *highBnd, double confidence, long len, double v[len]) { - double tailProb = (1.0 - confidence) / 2.0; + double tailProb = (1.0 - confidence) / 2.0; qsort(v, (size_t) len, sizeof(v[0]), compareDoubles); *lowBnd = interpolate(tailProb, v, len); @@ -388,7 +388,7 @@ void confidenceBounds(double *lowBnd, double *highBnd, double confidence, /// Print a BootChr object void BootChr_print(const BootChr * self, FILE * ofp) { - long rep, j; + long rep, j; fprintf(ofp, "BootChr_print: nsnp=%ld nrep=%ld blocksize=%ld nblock=%ld\n", @@ -412,12 +412,13 @@ void BootChr_print(const BootChr * self, FILE * ofp) { } #ifndef NDEBUG + /** For debugging BootChr_multiplicity */ unsigned BootChr_multiplicity_slow(BootChr * self, long snp, long rep) { - unsigned i, n = 0; + unsigned i, n = 0; for(i = 0; i < self->nblock; ++i) { - long distance = snp - self->start[rep][i]; + long distance = snp - self->start[rep][i]; if(distance < 0) break; @@ -427,4 +428,3 @@ unsigned BootChr_multiplicity_slow(BootChr * self, long snp, long rep) { return n; } #endif - 
From 8801b013cbaa5808a91452d61ae20bb9af6cfbe1 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 20 Feb 2018 14:45:17 -0700 Subject: [PATCH 037/101] . --- src/boot.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/boot.c b/src/boot.c index 05a98daf..76e8fad3 100644 --- a/src/boot.c +++ b/src/boot.c @@ -46,7 +46,6 @@ long LInt_div_round(long num, long denom); static void BootChr_allocArrays(BootChr * self); /// Divide num by denom and round the result to the nearest integer. -/// @return an structure of type ldiv_t. long LInt_div_round(long num, long denom) { assert(denom != 0L); ldiv_t quotrem = ldiv(num, denom); @@ -89,7 +88,7 @@ BootChr *BootChr_new(long nsnp, long nrep, int npat, long blocksize, self->npat = npat; self->nblock = LInt_div_round(nsnp, blocksize); - // Block positions are uniform on [0, nsnp-blocksize+1). + // Block start positions are uniform on [0, nsnp-blocksize+1). unsigned long endpos; endpos = nsnp - self->blocksize + 1; From f22bcfb6203eeb95d343f703bfaaeeeb6ca3b45c Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 20 Feb 2018 16:35:33 -0700 Subject: [PATCH 038/101] . --- src/popnode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/popnode.c b/src/popnode.c index 00af296b..a3754839 100644 --- a/src/popnode.c +++ b/src/popnode.c @@ -344,10 +344,10 @@ void PopNode_mix(PopNode * child, double *mPtr, bool mixFree, PopNode_sanityCheck(native, __FILE__, __LINE__); } -/// PopNode constructor. Allocates a new Gene and puts it into -/// the array within PopNode. The gene isn't owned by PopNode, -/// however. It will eventually be freed by a recursive call to -/// Gene_free, which will free the root Gene and all descendants. +/// Allocates a new Gene and puts it into the array within +/// PopNode. The gene isn't owned by PopNode, however. It will +/// eventually be freed by a recursive call to Gene_free, which will +/// free the root Gene and all descendants. 
void PopNode_newGene(PopNode * self, unsigned ndx) { assert(1 + self->nsamples < MAXSAMP); assert(ndx < 8 * sizeof(tipId_t)); From b89aaa895f5f0d6ca85781ac0d95ef37b4cb102c Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Thu, 22 Feb 2018 09:00:49 -0700 Subject: [PATCH 039/101] Legofit now prints out cost and parameter vector for each point at end of optimization. These lines all begin with "@", to make it easy to "grep" them. --- src/diffev.c | 16 ++++++++++------ src/input.lgo | 4 ++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/diffev.c b/src/diffev.c index f96067b9..e63ad440 100644 --- a/src/diffev.c +++ b/src/diffev.c @@ -551,11 +551,11 @@ static inline void TaskArg_setArray(TaskArg * self, int dim, double v[dim]) { void printState(int nPts, int nPar, double par[nPts][nPar], double cost[nPts], int imin, FILE *fp) { int i, j; - fprintf(fp,"%10s %s...\n", "cost", "param values"); + fprintf(fp,"# %-12s %s...\n", "cost", "param values"); for(i=0; i < nPts; ++i) { - fprintf(fp, "%10.6lf", cost[i]); + fprintf(fp, "@ %12.10lf", cost[i]); for(j=0; j < nPar; ++j) - fprintf(fp, " %lf", par[i][j]); + fprintf(fp, " %0.10lf", par[i][j]); if(i == imin) fprintf(fp, " <- best"); putchar('\n'); @@ -699,7 +699,7 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, " No initial points have finite values.\n" " Try increasing simulation replicates in stage %d.\n" " Current value: simReps=%ld\n" - " See -S argument to legofit.\n", + " See -S argument of legofit.\n", __FILE__,__LINE__, stage, SimSched_getSimReps(simSched)); exit(EXIT_FAILURE); @@ -737,8 +737,8 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, // accept mutation cost[i] = trial_cost; assignd(dim, (*pnew)[i], targ[i]->v); - if(trial_cost < cmin) { // Was this a new minimum? If so, - cmin = trial_cost; // reset cmin to new low. + if(trial_cost < cmin) { // New minimum. + cmin = trial_cost; // Reset cmin and imin. 
imin = i; assignd(dim, best, targ[i]->v); } @@ -787,6 +787,10 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, } } +#if 1 + // For each point, print cost and parameter vector + printState(nPts, dim, *pold, cost, imin, stdout); +#endif JobQueue_noMoreJobs(jq); if(*yspread <= dep.ytol) { status = 0; diff --git a/src/input.lgo b/src/input.lgo index ff75e72b..249ee57b 100644 --- a/src/input.lgo +++ b/src/input.lgo @@ -11,8 +11,8 @@ twoN free twoNn=1e3 # archaic population size # by "+" signs, so negative terms must be entered as shown below. # To spread a constraint function across several lines, break # the line after a "+" symbol. -#twoN constrained twoNxy=1e4 - 1.2*Txy # early modern population size -twoN constrained twoNxy=exp(4 - 1.2*log(Txy)) # early modern population size +twoN constrained twoNxy=1e4 - 1.2*Txy # early modern population size +#twoN constrained twoNxy=exp(4 - 1.2*log(Txy)) # early modern population size mixFrac free mN=0.02 # Neanderthal admixture into y segment x t=zero twoN=one samples=1 # Africa segment y t=zero twoN=one samples=1 # Eurasia From 06c4586b589e1e9def820e06b20b60f491e82718 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Thu, 22 Feb 2018 10:30:18 -0700 Subject: [PATCH 040/101] Changed printState function in diffev.c so that current optimal state is printed first. --- src/diffev.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/diffev.c b/src/diffev.c index e63ad440..25f25e38 100644 --- a/src/diffev.c +++ b/src/diffev.c @@ -547,17 +547,21 @@ static inline void TaskArg_setArray(TaskArg * self, int dim, double v[dim]) { self->cost = -1.0; } -/// Print current state +/// Print current state. 
Current optimal point is printed first void printState(int nPts, int nPar, double par[nPts][nPar], double cost[nPts], int imin, FILE *fp) { int i, j; fprintf(fp,"# %-12s %s...\n", "cost", "param values"); + fprintf(fp, "@ %12.10lf", cost[imin]); + for(j=0; j < nPar; ++j) + fprintf(fp, " %0.10lf", par[imin][j]); + putchar('\n'); for(i=0; i < nPts; ++i) { + if(i == imin) + continue; fprintf(fp, "@ %12.10lf", cost[i]); for(j=0; j < nPar; ++j) fprintf(fp, " %0.10lf", par[i][j]); - if(i == imin) - fprintf(fp, " <- best"); putchar('\n'); } } From 959b1037f6a2c72a56e4bbd4ab613a296841a480 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Thu, 22 Feb 2018 10:37:01 -0700 Subject: [PATCH 041/101] Fixed bug. Previous code had omitted a linefeed in the output. --- src/diffev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/diffev.c b/src/diffev.c index 25f25e38..d44b921b 100644 --- a/src/diffev.c +++ b/src/diffev.c @@ -552,10 +552,14 @@ void printState(int nPts, int nPar, double par[nPts][nPar], double cost[nPts], int imin, FILE *fp) { int i, j; fprintf(fp,"# %-12s %s...\n", "cost", "param values"); + + // print current optimum fprintf(fp, "@ %12.10lf", cost[imin]); for(j=0; j < nPar; ++j) fprintf(fp, " %0.10lf", par[imin][j]); putchar('\n'); + + // print everything else for(i=0; i < nPts; ++i) { if(i == imin) continue; From 344ebb75824ab5361118f0af08c503beb9f1d60e Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Thu, 22 Feb 2018 12:49:53 -0700 Subject: [PATCH 042/101] Legofit now has a new option, "--stateFile", which can be used to define an output file. If this option is used, the final state of the optimizer will be written into this file. Otherwise, the final state of the optimizer is not written. 
--- src/diffev.c | 13 +++++++------ src/diffev.h | 1 + src/legofit.c | 48 ++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 48 insertions(+), 14 deletions(-) diff --git a/src/diffev.c b/src/diffev.c index d44b921b..a711da71 100644 --- a/src/diffev.c +++ b/src/diffev.c @@ -551,22 +551,22 @@ static inline void TaskArg_setArray(TaskArg * self, int dim, double v[dim]) { void printState(int nPts, int nPar, double par[nPts][nPar], double cost[nPts], int imin, FILE *fp) { int i, j; - fprintf(fp,"# %-12s %s...\n", "cost", "param values"); + fprintf(fp,"# %-10s %s...\n", "cost", "param values"); // print current optimum - fprintf(fp, "@ %12.10lf", cost[imin]); + fprintf(fp, "%12.10lf", cost[imin]); for(j=0; j < nPar; ++j) fprintf(fp, " %0.10lf", par[imin][j]); - putchar('\n'); + putc('\n', fp); // print everything else for(i=0; i < nPts; ++i) { if(i == imin) continue; - fprintf(fp, "@ %12.10lf", cost[i]); + fprintf(fp, "%12.10lf", cost[i]); for(j=0; j < nPar; ++j) fprintf(fp, " %0.10lf", par[i][j]); - putchar('\n'); + putc('\n', fp); } } @@ -797,7 +797,8 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, #if 1 // For each point, print cost and parameter vector - printState(nPts, dim, *pold, cost, imin, stdout); + if(dep.stateFile) + printState(nPts, dim, *pold, cost, imin, dep.stateFile); #endif JobQueue_noMoreJobs(jq); if(*yspread <= dep.ytol) { diff --git a/src/diffev.h b/src/diffev.h index c64aa294..ba948a11 100644 --- a/src/diffev.h +++ b/src/diffev.h @@ -33,6 +33,7 @@ struct DiffEvPar { void (*ThreadState_free) (void *); void *initData; void (*initialize)(int, void *, int, double *, gsl_rng *rng); + FILE *stateFile; }; int diffev(int dim, double estimate[dim], double *loCost, diff --git a/src/legofit.c b/src/legofit.c index 225864b3..05af6521 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -24,6 +24,8 @@ of separations and of episodes of gene flow, and levels of gene flow. 
add stage with generations and simulation reps -p or --ptsPerDim number of DE points per free var + --stateFile + write final state of optimizer to file -1 or --singletons Use singleton site patterns -v or --verbose @@ -91,14 +93,14 @@ values had changed in a fixed number of iterations. That criterion was used in our recent paper, "Early history of Neanderthals and Denisovans", which was just published in PNAS. -I began to notice convergence problems with models larger than -those used in the PNAS paper. All bootstrap replicates would report -convergence, but some yielded wild parameter estimates. In these -outliers, the spread of objective function values was also very +I began to notice convergence problems with models larger than those +used in the August 2017 PNAS paper. All bootstrap replicates would +report convergence, but some yielded wild parameter estimates. In +these outliers, the spread of objective function values was also very large, indicating that the algorithm had not really converged. So I implemented a new convergence criterion, based on the spread of -objective function values. The iterations terminate when this -spread falls to a pre-determined value. +objective function values. The iterations terminate when this spread +falls to a pre-determined value. This new convergence criterion works best with the KL (Kullback-Leibler) cost function. Minimizing KL is the same as @@ -126,7 +128,14 @@ Second, you can relax the tolerance. By default, this is 1e-4. It is reported in the legofit output. To double this value, use "-T 2e-4" or "--tol 2e-4". -@copyright Copyright (c) 2016, 2017, Alan R. Rogers +The option "--stateFile" is used to define an output file for the +final state of the optimizer. This output file contains a row for each +point in the swarm of points maintained by diffev.c. In each row, the +first entry is the value of the cost function at that point. 
The +remaining entries give the parameter values in the same order in which +they are printed by legofit. + +@copyright Copyright (c) 2016, 2017, 2018, Alan R. Rogers . This file is released under the Internet Systems Consortium License, which can be found in file "LICENSE". **/ @@ -203,6 +212,7 @@ void usage(void) { tellopt("-S @ or --stage @", "add stage with generations and simulation reps"); tellopt("-p or --ptsPerDim ", "number of DE points per free var"); + tellopt("--stateFile ", "write final state of optimizer to file"); tellopt("-1 or --singletons", "Use singleton site patterns"); tellopt("-v or --verbose", "verbose output"); tellopt("--version", "Print version and exit"); @@ -247,6 +257,7 @@ int main(int argc, char **argv) { {"genomeSize", required_argument, 0, 'n'}, #endif {"singletons", no_argument, 0, '1'}, + {"stateFile", required_argument, 0, 'y'}, {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, {"version", no_argument, 0, 'V'}, @@ -266,6 +277,8 @@ int main(int argc, char **argv) { long simreps = 1000000; char lgofname[200] = { '\0' }; char patfname[200] = { '\0' }; + char statefname[200] = { '\0' }; + FILE *stateFile = NULL; // DiffEv parameters double F = 0.9; @@ -362,6 +375,20 @@ int main(int argc, char **argv) { nnuc = strtol(optarg, NULL, 10); break; #endif + case 'y': + status=snprintf(statefname, sizeof(statefname), "%s", optarg); + if(status >= sizeof(statefname)) { + fprintf(stderr,"%s:%d: buffer overflow\n", + __FILE__,__LINE__); + exit(EXIT_FAILURE); + } + stateFile = fopen(statefname, "w"); + if(stateFile==NULL) { + fprintf(stderr,"%s:%d: can't open \"%s\" for output.\n", + __FILE__,__LINE__, statefname); + exit(EXIT_FAILURE); + } + break; case '1': doSing=1; break; @@ -438,6 +465,8 @@ int main(int argc, char **argv) { printf("# lgo input file : %s\n", lgofname); printf("# site pat input file: %s\n", patfname); printf("# pts/dimension : %d\n", ptsPerDim); + if(stateFile) + printf("# output state file : %s\n", statefname); 
#if COST!=KL_COST && COST!=LNL_COST printf("# mut_rate/generation: %lg\n", u); printf("# nucleotides/genome : %ld\n", nnuc); @@ -515,7 +544,8 @@ int main(int argc, char **argv) { .initData = gptree, .initialize = initStateVec, .simSched = simSched, - .ytol = ytol + .ytol = ytol, + .stateFile = stateFile }; double estimate[dim]; @@ -585,6 +615,8 @@ int main(int argc, char **argv) { GPTree_free(gptree); SimSched_free(simSched); fprintf(stderr,"legofit is finished\n"); + if(stateFile) + fclose(stateFile); return 0; } From bd4342f9939de38e4008efcab1325761be167874 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Fri, 23 Feb 2018 14:44:22 -0700 Subject: [PATCH 043/101] . --- src/boot.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/boot.c b/src/boot.c index 76e8fad3..bdec1421 100644 --- a/src/boot.c +++ b/src/boot.c @@ -1,4 +1,3 @@ - /** * @file boot.c * @author Alan R. Rogers From 73805b6702e8827e64a82aeefa671239c8407d5d Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sat, 24 Feb 2018 05:42:28 -0700 Subject: [PATCH 044/101] Added comment --- src/boot.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/boot.c b/src/boot.c index bdec1421..c93566aa 100644 --- a/src/boot.c +++ b/src/boot.c @@ -179,6 +179,10 @@ long BootChr_multiplicity(const BootChr * self, long snpndx, long rep) { assert(snpndx - self->start[rep][lndx] < self->blocksize); // hndx is index of first block not containing snp + // First line below searches the sub-array beginning with + // entry lndx. This returns an index into the sub-array. + // The second line adds lndx to generate an index into the full + // array. hndx = long_first_geq(snpndx + 1, self->start[rep] + lndx, self->nblock - lndx); hndx += lndx; From 2db2b6f19b37ea771fb40e17b74930f3d753c54a Mon Sep 17 00:00:00 2001 From: "Alan R. 
Rogers" Date: Sun, 25 Feb 2018 21:56:27 -0700 Subject: [PATCH 045/101] Add state.c --- src/state.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 src/state.c diff --git a/src/state.c b/src/state.c new file mode 100644 index 00000000..25f2bb21 --- /dev/null +++ b/src/state.c @@ -0,0 +1,37 @@ +#include "state.h" +#include +#include + +typedef struct State State; + +struct State { + int npts, npar; // numbers of points and parameters + double **s; // s[i][j]=value of j'th param at i'th point +}; + +// Allocate a new State object. There is no point in optimizing +// this, because State is used only at the beginning and end +// of Legofit. +State *State_new(int npts, int npar) { + int i; + State *self = malloc(sizeof(State)); + CHECKMEM(self); + self->npts = npts; + self->npar = npar; + self->s = malloc(npts * sizeof(self->s[0])); + CHECKMEM(self->s); + for(i=0; i < npts; ++i) { + self->s[i] = malloc(npar * sizeof(self->s[0][0])); + CHECKMEM(self->s[i]); + } + return self; +} + +void State_free(State *self) { + assert(self); + int i; + for(i=0; i < self->npts; ++i) + free(self->s[i]); + free(self->s); + free(self); +} From 7e109e6821b13868a9ea992496abe942dcf05eb4 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Mon, 26 Feb 2018 16:28:00 -0700 Subject: [PATCH 046/101] . --- src/parkeyval.c | 86 +++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 45 deletions(-) diff --git a/src/parkeyval.c b/src/parkeyval.c index d91859c3..3d3d1e0a 100644 --- a/src/parkeyval.c +++ b/src/parkeyval.c @@ -1,3 +1,4 @@ + /** * @file parkeyval.c * @author Alan R. 
Rogers @@ -17,28 +18,26 @@ #define MAX_PARAM_NAME 100 struct ParKeyVal { - char key[MAX_PARAM_NAME]; - double *valPtr; // not locally owned - ParamStatus pstat; - ParKeyVal *next; + char key[MAX_PARAM_NAME]; + double *valPtr; // not locally owned + ParamStatus pstat; + ParKeyVal *next; }; -ParKeyVal *ParKeyVal_new(const char *key, double *vptr, - ParamStatus pstat, - ParKeyVal *next); +ParKeyVal *ParKeyVal_new(const char *key, double *vptr, + ParamStatus pstat, ParKeyVal * next); /// Constructor. Call with next=NULL to terminate linked list. -ParKeyVal *ParKeyVal_new(const char *key, double *vptr, - ParamStatus pstat, - ParKeyVal * next) { +ParKeyVal *ParKeyVal_new(const char *key, double *vptr, + ParamStatus pstat, ParKeyVal * next) { if(strlen(key) >= MAX_PARAM_NAME) eprintf("%s:%s:%d: Parameter name too long. Max=%d.\n", __FILE__, __func__, __LINE__, MAX_PARAM_NAME); - ParKeyVal *self = malloc(sizeof(ParKeyVal)); + ParKeyVal *self = malloc(sizeof(ParKeyVal)); CHECKMEM(self); snprintf(self->key, sizeof(self->key), "%s", key); self->valPtr = vptr; - self->pstat = pstat; + self->pstat = pstat; self->next = next; ParKeyVal_sanityCheck(self, __FILE__, __LINE__); return self; @@ -53,12 +52,12 @@ void ParKeyVal_free(ParKeyVal * self) { } /// Insert a new key/pointer pair into sorted linked list. -ParKeyVal *ParKeyVal_add(ParKeyVal * self, const char *key, - double *vptr, ParamStatus pstat) { +ParKeyVal *ParKeyVal_add(ParKeyVal * self, const char *key, + double *vptr, ParamStatus pstat) { if(self == NULL) return ParKeyVal_new(key, vptr, pstat, NULL); - int i = strcmp(key, self->key); + int i = strcmp(key, self->key); if(i < 0) return ParKeyVal_new(key, vptr, pstat, self); else if(i == 0) @@ -72,29 +71,29 @@ ParKeyVal *ParKeyVal_add(ParKeyVal * self, const char *key, /// Find key in linked list. On success, return pointer corresponding /// to key. On failure, return NULL. 
-double *ParKeyVal_get(ParKeyVal * self, ParamStatus *pstat, const char *key) { +double *ParKeyVal_get(ParKeyVal * self, ParamStatus * pstat, const char *key) { if(self == NULL) - return NULL; + return NULL; - int i = strcmp(key, self->key); - if(i < 0) // Failed - return NULL; + int i = strcmp(key, self->key); + if(i < 0) // Failed + return NULL; - if(i == 0) { // Success - *pstat = self->pstat; - return self->valPtr; - } + if(i == 0) { // Success + *pstat = self->pstat; + return self->valPtr; + } return ParKeyVal_get(self->next, pstat, key); } /// Print a ParKeyVal -void ParKeyVal_print(ParKeyVal *self, FILE *fp) { - if(self == NULL) - fprintf(fp,"NULL\n"); - else { - fprintf(fp,"[%p:%s,%p,", self, self->key, self->valPtr); - switch(self->pstat) { +void ParKeyVal_print(ParKeyVal * self, FILE * fp) { + if(self == NULL) + fprintf(fp, "NULL\n"); + else { + fprintf(fp, "[%p:%s,%p,", self, self->key, self->valPtr); + switch (self->pstat) { case Free: fputs("Free", fp); break; @@ -108,20 +107,19 @@ void ParKeyVal_print(ParKeyVal *self, FILE *fp) { fputs("Constrained", fp); break; default: - fprintf(stderr,"%s:%d: Unknown ParamStat value: %d.\n", - __FILE__,__LINE__,self->pstat); + fprintf(stderr, "%s:%d: Unknown ParamStat value: %d.\n", + __FILE__, __LINE__, self->pstat); exit(EXIT_FAILURE); } fputs("]->", fp); - ParKeyVal_print(self->next, fp); - } + ParKeyVal_print(self->next, fp); + } } /// Abort if ParKeyVal fails tests -void ParKeyVal_sanityCheck(ParKeyVal *self, const char *file, - int line) { +void ParKeyVal_sanityCheck(ParKeyVal * self, const char *file, int line) { #ifndef NDEBUG - if(self==NULL) + if(self == NULL) return; REQUIRE(self->valPtr != NULL, file, line); ParKeyVal_sanityCheck(self->next, file, line); @@ -129,7 +127,7 @@ void ParKeyVal_sanityCheck(ParKeyVal *self, const char *file, } /// Return 1 if the two linked lists are equal; 0 if they differ. 
-int ParKeyVal_equals(ParKeyVal *lhs, ParKeyVal *rhs) { +int ParKeyVal_equals(ParKeyVal * lhs, ParKeyVal * rhs) { if(lhs == NULL && rhs == NULL) return 1; if(lhs == NULL && rhs != NULL) { @@ -150,12 +148,12 @@ int ParKeyVal_equals(ParKeyVal *lhs, ParKeyVal *rhs) { } if(lhs->valPtr != NULL && rhs->valPtr != NULL) { if(!Dbl_equals_allowNonfinite(*lhs->valPtr, *rhs->valPtr)) { - return 0; + return 0; } - if(lhs->pstat != rhs->pstat) { - return 0; + if(lhs->pstat != rhs->pstat) { + return 0; } - } + } return ParKeyVal_equals(lhs->next, rhs->next); } @@ -163,8 +161,6 @@ int ParKeyVal_equals(ParKeyVal *lhs, ParKeyVal *rhs) { int legalName(const char *name) { const char *legal = "abcdefghijklmnopqrstuvwxyz" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "0123456789" - "._:@$"; + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "0123456789" "._:@$"; return strlen(name) == strspn(name, legal); } From 68dc604492dd82c23b8e8f8920403564796dd7c0 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Fri, 2 Mar 2018 14:10:49 -0700 Subject: [PATCH 047/101] Add state.h. Initialize imin at beginning of diffev. 
--- src/diffev.c | 4 ++-- src/state.h | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 src/state.h diff --git a/src/diffev.c b/src/diffev.c index a711da71..b8040ab2 100644 --- a/src/diffev.c +++ b/src/diffev.c @@ -621,7 +621,7 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, DiffEvPar dep, gsl_rng * rng) { int i, j; // counting variables - int imin; // index to member with lowest energy + int imin = INT_MAX; // index to member with lowest energy int gen; SimSched *simSched = dep.simSched; const int refresh = dep.refresh; @@ -797,7 +797,7 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, #if 1 // For each point, print cost and parameter vector - if(dep.stateFile) + if(dep.stateFile && imin < INT_MAX) printState(nPts, dim, *pold, cost, imin, dep.stateFile); #endif JobQueue_noMoreJobs(jq); diff --git a/src/state.h b/src/state.h new file mode 100644 index 00000000..17adc46f --- /dev/null +++ b/src/state.h @@ -0,0 +1,7 @@ +#ifndef ARR_STATE_H +#define ARR_STATE_H + +State *State_new(int npts, int npar); +void State_free(State *self); + +#endif From 369b2f99878de8ed8ae3eb89491d4ad8a101ff35 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sat, 3 Mar 2018 16:47:22 -0700 Subject: [PATCH 048/101] Almost finished with xstate.c --- src/gptree.c | 18 ++++++++++ src/gptree.h | 3 +- src/legofit.c | 20 ----------- src/state.c | 82 +++++++++++++++++++++++++++++++++++++++++++-- src/state.h | 11 +++++- src/typedefs.h | 1 + test/Makefile | 10 ++++-- test/xstate.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 210 insertions(+), 26 deletions(-) create mode 100644 test/xstate.c diff --git a/src/gptree.c b/src/gptree.c index 8c7cc9e2..dd6aae79 100644 --- a/src/gptree.c +++ b/src/gptree.c @@ -39,6 +39,24 @@ struct GPTree { SampNdx sndx; // Index of sample pointers into PopNode objects. }; +/// Initialize vector x. 
If ndx==0, simply copy the parameter vector +/// from the GPTree object. Otherwise, randomize the GPTree first. +/// This ensures that differential evolution starts with a set of +/// points, one of which is the same as the values in the input +/// file. This allows you to improve on existing estimates without +/// starting from scratch each time. +void initStateVec(int ndx, void *void_p, int n, double x[n], gsl_rng *rng){ + GPTree *gpt = (GPTree *) void_p; + if(ndx == 0) + GPTree_getParams(gpt, n, x); + else { + GPTree *g2 = GPTree_dup(gpt); + GPTree_randomize(g2, rng); + GPTree_getParams(g2, n, x); + GPTree_free(g2); + } +} + /// Print a description of parameters. void GPTree_printParStore(GPTree * self, FILE * fp) { if(ParStore_constrain(self->parstore)) diff --git a/src/gptree.h b/src/gptree.h index 1971c59a..1555fead 100644 --- a/src/gptree.h +++ b/src/gptree.h @@ -24,5 +24,6 @@ void GPTree_randomize(GPTree *self, gsl_rng *rng); void GPTree_printParStore(GPTree *self, FILE *fp); void GPTree_printParStoreFree(GPTree *self, FILE *fp); int GPTree_feasible(const GPTree *self, int verbose); - +void initStateVec(int ndx, void *void_p, int n, double x[n], + gsl_rng *rng); #endif diff --git a/src/legofit.c b/src/legofit.c index 05af6521..70d7357a 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -167,8 +167,6 @@ extern unsigned long rngseed; extern volatile sig_atomic_t sigstat; void usage(void); -void initStateVec(int ndx, void *void_p, int n, double x[n], - gsl_rng *rng); void *ThreadState_new(void *notused); void ThreadState_free(void *rng); @@ -220,24 +218,6 @@ void usage(void) { exit(1); } -/// Initialize vector x. If ndx==0, simply copy the parameter vector -/// from the GPTree object. Otherwise, randomize the GPTree first. -/// This ensures that differential evolution starts with a set of -/// points, one of which is the same as the values in the input -/// file. This allows you to improve on existing estimates without -/// starting from scratch each time. 
-void initStateVec(int ndx, void *void_p, int n, double x[n], gsl_rng *rng){ - GPTree *gpt = (GPTree *) void_p; - if(ndx == 0) - GPTree_getParams(gpt, n, x); - else { - GPTree *g2 = GPTree_dup(gpt); - GPTree_randomize(g2, rng); - GPTree_getParams(g2, n, x); - GPTree_free(g2); - } -} - int main(int argc, char **argv) { // Install handler for keyboard interrupts. diff --git a/src/state.c b/src/state.c index 25f2bb21..585788fd 100644 --- a/src/state.c +++ b/src/state.c @@ -1,14 +1,42 @@ #include "state.h" +#include "error.h" #include #include - -typedef struct State State; +#include struct State { int npts, npar; // numbers of points and parameters double **s; // s[i][j]=value of j'th param at i'th point }; +int State_npoints(State *self) { + return self->npts; +} + +int State_nparameters(State *self) { + return self->npar; +} + +// Set state vector with index "ndx" equal to vector x. Returns 0 on success, EINVAL if dim or ndx is out of range. +int State_setVector(State *self, int ndx, int dim, double x[dim]) { + if(dim != self->npar) + return EINVAL; + if(ndx < 0 || ndx >= self->npts) + return EINVAL; + memcpy(self->s[ndx], x, dim * sizeof(x[0])); + return 0; +} + +// Copy state vector with index "ndx" into vector x. Returns 0 on success, EINVAL if dim or ndx is out of range. +int State_getVector(State *self, int ndx, int dim, double x[dim]) { + if(dim != self->npar) + return EINVAL; + if(ndx < 0 || ndx >= self->npts) + return EINVAL; + memcpy(x, self->s[ndx], self->npar * sizeof(self->s[0][0])); + return 0; +} + // Allocate a new State object. There is no point in optimizing // this, because State is used only at the beginning and end // of Legofit. 
@@ -35,3 +63,53 @@ void State_free(State *self) { free(self->s); free(self); } + +// Construct a new State object by reading a file +State *State_read(FILE *fp) { + int i, j, npts, npar, status; + status = fscanf(stderr, "%d%d", &npts, &npar); + if(status != 2) { + fprintf(stderr,"%s:%d: Can't read dimensions in state file\n", + __FILE__,__LINE__); + return NULL; + } + State *self = State_new(npts, npar); + CHECKMEM(self); + + for(i=0; i < npts; ++i) { + for(j=0; j < npar; ++j) { + status = fscanf(stderr, "%lf", self->s[i]+j); + if(status != 1) { + fprintf(stderr,"%s:%d:" + " Can't read value (%d,%d) in state file\n", + __FILE__,__LINE__,i,j); + State_free(self); + return NULL; + } + } + } + return self; +} + +// Print State object to a file +int State_print(State *self, FILE *fp) { + int i, j, status; + + status = fprintf(fp, "%d %d\n", self->npts, self->npar); + if(status==0) { + fprintf(stderr,"%s:%d: can't write to file\n", + __FILE__,__LINE__); + return EIO; + } + for(i=0; i < self->npts; ++i) { + for(j=0; j < self->npar; ++j) { + status = fprintf(fp, " %0.18lf", self->s[i][j]); + if(status == 0) { + fprintf(stderr,"%s:%d: can't write to file\n", + __FILE__,__LINE__); + return EIO; + } + } + putc('\n', fp); + } +} diff --git a/src/state.h b/src/state.h index 17adc46f..325fbd2d 100644 --- a/src/state.h +++ b/src/state.h @@ -1,7 +1,16 @@ #ifndef ARR_STATE_H #define ARR_STATE_H +#include "typedefs.h" +#include + +int State_npoints(State *self); +int State_nparameters(State *self); State *State_new(int npts, int npar); -void State_free(State *self); +void State_free(State *self); +State *State_read(FILE *fp); +int State_print(State *self, FILE *fp); +int State_setVector(State *self, int ndx, int dim, double x[dim]); +int State_getVector(State *self, int ndx, int dim, double x[dim]); #endif diff --git a/src/typedefs.h b/src/typedefs.h index a02f6fba..5b1f2ad9 100644 --- a/src/typedefs.h +++ b/src/typedefs.h @@ -26,6 +26,7 @@ typedef struct PopNodeTab PopNodeTab; 
typedef struct ScrmReader ScrmReader; typedef struct SimSched SimSched; typedef struct SampNdx SampNdx; +typedef struct State State; typedef struct StrInt StrInt; typedef struct Tokenizer Tokenizer; typedef struct DAFReader DAFReader; diff --git a/test/Makefile b/test/Makefile index 1bc81b1b..f4d8ef49 100644 --- a/test/Makefile +++ b/test/Makefile @@ -5,7 +5,8 @@ prof := incl := -I/usr/local/include -I/opt/local/include -I../src tests := xbinary xboot xbranchtab xdafreader xrafreader xdiffev xgene \ xgptree xpopnodetab xjobqueue xlblndx xllrbtree xparkeyval xparse \ - xparstore xpopnode xscrmreader xsimsched xstrint xdtnorm xmisc xerror + xparstore xpopnode xscrmreader xsimsched xstrint xdtnorm xmisc xerror \ + xstate CC := gcc @@ -56,6 +57,7 @@ test : $(tests) -./xpopnodetab -./xscrmreader -./xsimsched + -./xstate -./xstrint @echo "ALL UNIT TESTS WERE COMPLETED." @@ -67,6 +69,10 @@ XMISC := xmisc.o misc.o xmisc : $(XMISC) $(CC) $(CFLAGS) -o $@ $(XMISC) $(lib) +XSTATE := xstate.o misc.o state.o +xstate : $(XSTATE) + $(CC) $(CFLAGS) -o $@ $(XSTATE) $(lib) + XPOPNODETAB := xpopnodetab.o popnodetab.o misc.o popnode.o gene.o \ branchtab.o lblndx.o tokenizer.o dtnorm.o binary.o \ parkeyval.o parstore.o tinyexpr.o @@ -103,7 +109,7 @@ xrafreader : $(XRAFREADER) xscrmreader.o : scrmreader.c $(CC) $(CFLAGS) -c -DTEST -o $@ ../src/scrmreader.c -XSCRMREADER := xscrmreader.o misc.o tokenizer.o +XSCRMREADER := xscrmreader.o misc.o tokenizer.o xscrmreader : $(XSCRMREADER) $(CC) $(CFLAGS) -o $@ $(XSCRMREADER) $(lib) diff --git a/test/xstate.c b/test/xstate.c new file mode 100644 index 00000000..c0239e10 --- /dev/null +++ b/test/xstate.c @@ -0,0 +1,91 @@ +/** + * @file xstate.c + * @author Alan R. Rogers + * @brief Test state.c. + * @copyright Copyright (c) 2018, Alan R. Rogers + * . This file is released under the Internet + * Systems Consortium License, which can be found in file "LICENSE". 
+ */ +#include "typedefs.h" +#include "state.h" +#include "misc.h" +#include +#include +#include +#include + +#ifdef NDEBUG +# error "Unit tests must be compiled without -DNDEBUG flag" +#endif + +int main(int argc, char **argv) { + int verbose=0; + + switch (argc) { + case 1: + break; + case 2: + if(strncmp(argv[1], "-v", 2) != 0) { + fprintf(stderr, "usage: xstate [-v]\n"); + exit(EXIT_FAILURE); + } + verbose = 1; + break; + default: + fprintf(stderr, "usage: xstate [-v]\n"); + exit(EXIT_FAILURE); + } + + const char *fname = "xstate.tmp"; + const int npts=3, npar=2; + int i, status; + double x[npts][npar] = {{1.0, 2.0}, {3.0,4.0}, {5.0, 6.0}}; + State *s = State_new(npts, npar); + CHECKMEM(s); + for(i=0; i < npts; ++i) { + status = State_setVector(s, i, npar, x[i]); + switch(status) { + case 0: + break; + case EINVAL: + fprintf(stderr,"%s:%d: Dimension mismatch in State_setVector\n", + __FILE__,__LINE__); + exit(1); + default: + fprintf(stderr,"%s:%d: Unknown error\n", __FILE__,__LINE__); + exit(1); + } + } + + FILE *fp = fopen(fname, "w"); + assert(fp); + status = State_print(s, fp); + switch(status) { + case 0: + break; + case EIO: + fprintf(stderr,"%s:%d: can't write to file\n", __FILE__,__LINE__); + exit(1); + default: + fprintf(stderr,"%s:%d: Unknown error\n", __FILE__,__LINE__); + exit(1); + } + + State_free(s); + + fp = fopen(fname, "r"); + assert(fp); + s = State_read(fp); + CHECKMEM(s); + fclose(fp); + unlink(fname); + + double y[npar]; + for(i=0; i Date: Sat, 3 Mar 2018 18:16:53 -0700 Subject: [PATCH 049/101] state.c and state.h pass unit test in xstate.c --- .gitignore | 1 + src/state.c | 11 +++++++---- test/xstate.c | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index dc887936..265c0f17 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ xpopnode xpopnodetab xsampndx xsimsched +xstate xstrtab xterm xbranchtab diff --git a/src/state.c b/src/state.c index 585788fd..0cca7683 100644 --- a/src/state.c +++ 
b/src/state.c @@ -1,7 +1,9 @@ #include "state.h" #include "error.h" +#include "misc.h" #include #include +#include #include struct State { @@ -48,8 +50,8 @@ State *State_new(int npts, int npar) { self->npar = npar; self->s = malloc(npts * sizeof(self->s[0])); CHECKMEM(self->s); - for(i=0; i < npar; ++i) { - self->s[i] = malloc(npar * sizeof(self->s[0][0])); + for(i=0; i < npts; ++i) { + self->s[i] = malloc(npar * sizeof(double)); CHECKMEM(self->s[i]); } return self; @@ -67,7 +69,7 @@ void State_free(State *self) { // Construct a new State object by reading a file State *State_read(FILE *fp) { int i, j, npts, npar, status; - status = fscanf(stderr, "%d%d", &npts, &npar); + status = fscanf(fp, "%d %d", &npts, &npar); if(status != 2) { fprintf(stderr,"%s:%d: Can't read dimensions in state file\n", __FILE__,__LINE__); @@ -78,7 +80,7 @@ State *State_read(FILE *fp) { for(i=0; i < npts; ++i) { for(j=0; j < npar; ++j) { - status = fscanf(stderr, "%lf", self->s[i]+j); + status = fscanf(fp, "%lf", self->s[i]+j); if(status != 1) { fprintf(stderr,"%s:%d:" " Can't read value (%d,%d) in state file\n", @@ -112,4 +114,5 @@ int State_print(State *self, FILE *fp) { } putc('\n', fp); } + return 0; } diff --git a/test/xstate.c b/test/xstate.c index c0239e10..ec8b4140 100644 --- a/test/xstate.c +++ b/test/xstate.c @@ -73,6 +73,7 @@ int main(int argc, char **argv) { State_free(s); + fclose(fp); fp = fopen(fname, "r"); assert(fp); s = State_read(fp); From da9bfba47b47e87dded91a6029c3f722e3423deb Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sun, 4 Mar 2018 10:29:15 -0700 Subject: [PATCH 050/101] Legofit reads and writes state file. Unit tests still need to be updated. 
--- src/Makefile | 6 ++-- src/diffev.c | 14 ++++---- src/diffev.h | 5 ++- src/gptree.c | 3 +- src/gptree.h | 2 +- src/legofit.c | 95 +++++++++++++++++++++++++++++++++++++-------------- src/state.c | 78 ++++++++++++++++++++++++++++-------------- src/state.h | 3 +- 8 files changed, 138 insertions(+), 68 deletions(-) diff --git a/src/Makefile b/src/Makefile index 581de354..d7ccc4f0 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,7 @@ destination := $(HOME)/bin global_destination := $(HOME)/group/bin -opt := -DNDEBUG -O3 -finline-functions # For full optimization -#opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging +#opt := -DNDEBUG -O3 -finline-functions # For full optimization +opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging #opt := -O3 -finline-functions # Optimization + debugging #prof := -pg -rdynamic # For profiling prof := @@ -52,7 +52,7 @@ legosim : $(LEGOSIM) LEGOFIT := legofit.o patprob.o gptree.o binary.o jobqueue.o misc.o \ parse.o branchtab.o popnodetab.o lblndx.o tokenizer.o parstore.o \ parkeyval.o popnode.o gene.o cost.o diffev.o dprintf.o rngseed.o \ - simsched.o dtnorm.o tinyexpr.o + simsched.o dtnorm.o tinyexpr.o state.o legofit : $(LEGOFIT) $(CC) $(CFLAGS) -o $@ $(LEGOFIT) $(lib) diff --git a/src/diffev.c b/src/diffev.c index b8040ab2..502b1c7b 100644 --- a/src/diffev.c +++ b/src/diffev.c @@ -663,7 +663,8 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, // Initialize array of points for(i = 0; i < nPts; ++i) { - (*dep.initialize)(i, dep.initData, dim, c[i], rng); + status=State_getVector(dep.state, i, dim, c[i]); + assert(status==0); if(dep.jobData) { jobData[i] = (*dep.JobData_dup)(dep.jobData); CHECKMEM(jobData[i]); @@ -745,7 +746,7 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, // accept mutation cost[i] = trial_cost; assignd(dim, (*pnew)[i], targ[i]->v); - if(trial_cost < cmin) { // New minimum. + if(trial_cost < cmin) { // New minimum. 
cmin = trial_cost; // Reset cmin and imin. imin = i; assignd(dim, best, targ[i]->v); @@ -795,11 +796,6 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, } } -#if 1 - // For each point, print cost and parameter vector - if(dep.stateFile && imin < INT_MAX) - printState(nPts, dim, *pold, cost, imin, dep.stateFile); -#endif JobQueue_noMoreJobs(jq); if(*yspread <= dep.ytol) { status = 0; @@ -814,6 +810,10 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, // Return estimates *loCost = cmin; memcpy(estimate, best, dim * sizeof(estimate[0])); + for(i=0; i < nPts; ++i) { + State_setCost(dep.state, i, cost[i]); + State_setVector(dep.state, i, dim, (*pold)[i]); + } // Free memory for(i = 0; i < nPts; ++i) { diff --git a/src/diffev.h b/src/diffev.h index ba948a11..f785dda3 100644 --- a/src/diffev.h +++ b/src/diffev.h @@ -4,6 +4,7 @@ # define MAXDIM 35 # include "typedefs.h" +# include "state.h" # include # include # include @@ -31,9 +32,7 @@ struct DiffEvPar { void *threadData; void *(*ThreadState_new) (void *); void (*ThreadState_free) (void *); - void *initData; - void (*initialize)(int, void *, int, double *, gsl_rng *rng); - FILE *stateFile; + State *state; }; int diffev(int dim, double estimate[dim], double *loCost, diff --git a/src/gptree.c b/src/gptree.c index dd6aae79..deff2243 100644 --- a/src/gptree.c +++ b/src/gptree.c @@ -45,8 +45,7 @@ struct GPTree { /// points, one of which is the same as the values in the input /// file. This allows you to improve on existing estimates without /// starting from scratch each time. 
-void initStateVec(int ndx, void *void_p, int n, double x[n], gsl_rng *rng){ - GPTree *gpt = (GPTree *) void_p; +void initStateVec(int ndx, GPTree *gpt, int n, double x[n], gsl_rng *rng){ if(ndx == 0) GPTree_getParams(gpt, n, x); else { diff --git a/src/gptree.h b/src/gptree.h index 1555fead..727a7804 100644 --- a/src/gptree.h +++ b/src/gptree.h @@ -24,6 +24,6 @@ void GPTree_randomize(GPTree *self, gsl_rng *rng); void GPTree_printParStore(GPTree *self, FILE *fp); void GPTree_printParStoreFree(GPTree *self, FILE *fp); int GPTree_feasible(const GPTree *self, int verbose); -void initStateVec(int ndx, void *void_p, int n, double x[n], +void initStateVec(int ndx, GPTree *gpt, int n, double x[n], gsl_rng *rng); #endif diff --git a/src/legofit.c b/src/legofit.c index 70d7357a..95c64a5a 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -24,7 +24,7 @@ of separations and of episodes of gene flow, and levels of gene flow. add stage with generations and simulation reps -p or --ptsPerDim number of DE points per free var - --stateFile + --stateOut write final state of optimizer to file -1 or --singletons Use singleton site patterns @@ -128,7 +128,7 @@ Second, you can relax the tolerance. By default, this is 1e-4. It is reported in the legofit output. To double this value, use "-T 2e-4" or "--tol 2e-4". -The option "--stateFile" is used to define an output file for the +The option "--stateOut" is used to define an output file for the final state of the optimizer. This output file contains a row for each point in the swarm of points maintained by diffev.c. In each row, the first entry is the value of the cost function at that point. The @@ -148,6 +148,7 @@ Systems Consortium License, which can be found in file "LICENSE". 
#include "parstore.h" #include "patprob.h" #include "simsched.h" +#include "state.h" #include #include #include @@ -210,7 +211,9 @@ void usage(void) { tellopt("-S @ or --stage @", "add stage with generations and simulation reps"); tellopt("-p or --ptsPerDim ", "number of DE points per free var"); - tellopt("--stateFile ", "write final state of optimizer to file"); + tellopt("--stateIn ", + "read initial state of optimizer from file"); + tellopt("--stateOut ", "write final state of optimizer to file"); tellopt("-1 or --singletons", "Use singleton site patterns"); tellopt("-v or --verbose", "verbose output"); tellopt("--version", "Print version and exit"); @@ -237,7 +240,8 @@ int main(int argc, char **argv) { {"genomeSize", required_argument, 0, 'n'}, #endif {"singletons", no_argument, 0, '1'}, - {"stateFile", required_argument, 0, 'y'}, + {"stateIn", required_argument, 0, 'z'}, + {"stateOut", required_argument, 0, 'y'}, {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, {"version", no_argument, 0, 'V'}, @@ -257,8 +261,10 @@ int main(int argc, char **argv) { long simreps = 1000000; char lgofname[200] = { '\0' }; char patfname[200] = { '\0' }; - char statefname[200] = { '\0' }; - FILE *stateFile = NULL; + char stateOutName[200] = { '\0' }; + char stateInName[200] = { '\0' }; + FILE *stateOut = NULL; + FILE *stateIn = NULL; // DiffEv parameters double F = 0.9; @@ -271,7 +277,7 @@ int main(int argc, char **argv) { int strategy = 1; int ptsPerDim = 10; int verbose = 0; - SimSched *simSched = SimSched_new(); + SimSched *simSched = SimSched_new(); #if defined(__DATE__) && defined(__TIME__) printf("# Program was compiled: %s %s\n", __DATE__, __TIME__); @@ -356,16 +362,30 @@ int main(int argc, char **argv) { break; #endif case 'y': - status=snprintf(statefname, sizeof(statefname), "%s", optarg); - if(status >= sizeof(statefname)) { + status=snprintf(stateOutName, sizeof(stateOutName), "%s", optarg); + if(status >= sizeof(stateOutName)) { fprintf(stderr,"%s:%d: 
buffer overflow\n", __FILE__,__LINE__); exit(EXIT_FAILURE); } - stateFile = fopen(statefname, "w"); - if(stateFile==NULL) { + stateOut = fopen(stateOutName, "w"); + if(stateOut==NULL) { fprintf(stderr,"%s:%d: can't open \"%s\" for output.\n", - __FILE__,__LINE__, statefname); + __FILE__,__LINE__, stateOutName); + exit(EXIT_FAILURE); + } + break; + case 'z': + status=snprintf(stateInName, sizeof(stateInName), "%s", optarg); + if(status >= sizeof(stateInName)) { + fprintf(stderr,"%s:%d: buffer overflow\n", + __FILE__,__LINE__); + exit(EXIT_FAILURE); + } + stateIn = fopen(stateInName, "r"); + if(stateIn==NULL) { + fprintf(stderr,"%s:%d: can't open \"%s\" for output.\n", + __FILE__,__LINE__, stateOutName); exit(EXIT_FAILURE); } break; @@ -425,17 +445,40 @@ int main(int argc, char **argv) { GPTree *gptree = GPTree_new(lgofname, bnd); LblNdx lblndx = GPTree_getLblNdx(gptree); + gsl_rng *rng = gsl_rng_alloc(gsl_rng_taus); + gsl_rng_set(rng, rngseed); + rngseed = (rngseed == ULONG_MAX ? 0 : rngseed+1); + int dim = GPTree_nFree(gptree); // number of free parameters if(dim == 0) { fprintf(stderr,"Error@%s:%d: no free parameters\n", __FILE__,__LINE__); exit(EXIT_FAILURE); } + int npts = dim*ptsPerDim; + + // DiffEv state array is a matrix with a row for each point + // and a column for each parameter. 
+ State *state; + if(stateIn) { + // read State from file + state = State_read(stateIn); + CHECKMEM(state); + }else{ + // de novo State + state = State_new(npts, dim); + CHECKMEM(state); + for(i=0; i < npts; ++i) { + double x[dim]; + initStateVec(i, gptree, dim, x, rng); + State_setVector(state, i, dim, x); + } + } if(nThreads == 0) nThreads = ceil(0.75*getNumCores()); - if(nThreads > dim*ptsPerDim) - nThreads = dim*ptsPerDim; + if(nThreads > npts) + nThreads = npts; printf("# DE strategy : %d\n", strategy); printf("# F : %lf\n", F); @@ -444,9 +487,12 @@ int main(int argc, char **argv) { printf("# nthreads : %d\n", nThreads); printf("# lgo input file : %s\n", lgofname); printf("# site pat input file: %s\n", patfname); - printf("# pts/dimension : %d\n", ptsPerDim); - if(stateFile) - printf("# output state file : %s\n", statefname); + printf("# free parameters : %d\n", dim); + printf("# pts/parameter : %d\n", ptsPerDim); + if(stateIn) + printf("# input state file : %s\n", stateInName); + if(stateOut) + printf("# output state file : %s\n", stateOutName); #if COST!=KL_COST && COST!=LNL_COST printf("# mut_rate/generation: %lg\n", u); printf("# nucleotides/genome : %ld\n", nnuc); @@ -521,20 +567,14 @@ int main(int argc, char **argv) { .threadData = NULL, .ThreadState_new = ThreadState_new, .ThreadState_free = ThreadState_free, - .initData = gptree, - .initialize = initStateVec, + .state = state, .simSched = simSched, .ytol = ytol, - .stateFile = stateFile }; double estimate[dim]; double cost, yspread; - gsl_rng *rng = gsl_rng_alloc(gsl_rng_taus); - gsl_rng_set(rng, rngseed); - rngseed = (rngseed == ULONG_MAX ? 
0 : rngseed+1); - printf("Initial parameter values\n"); GPTree_printParStore(gptree, stdout); @@ -588,6 +628,11 @@ int main(int argc, char **argv) { printf("%15s %10.7lf\n", buff2, brlen[ord[j]]); } + if(stateOut) { + State_print(state, stateOut); + fclose(stateOut); + } + BranchTab_free(bt); BranchTab_free(obs); gsl_rng_free(rng); @@ -595,8 +640,6 @@ int main(int argc, char **argv) { GPTree_free(gptree); SimSched_free(simSched); fprintf(stderr,"legofit is finished\n"); - if(stateFile) - fclose(stateFile); return 0; } diff --git a/src/state.c b/src/state.c index 0cca7683..1b852400 100644 --- a/src/state.c +++ b/src/state.c @@ -8,6 +8,7 @@ struct State { int npts, npar; // numbers of points and parameters + double *cost; // cost[i] is cost function at i'th point double **s; // s[i][j]=value of j'th param at i'th point }; @@ -19,14 +20,23 @@ int State_nparameters(State *self) { return self->npar; } +void State_setCost(State *self, int ndx, double cost) { + if(ndx >= self->npts) { + fprintf(stderr,"%s:%d: index out of bounds\n", + __FILE__,__LINE__); + exit(EXIT_FAILURE); + } + self->cost[ndx] = cost; +} + // Set state vector with index "ndx" equal to vector x. -int State_setVector(State *self, int ndx, int dim, double x[dim]) { - if(dim != self->npar) - return EINVAL; - if(ndx >= self->npts) - return EINVAL; +void State_setVector(State *self, int ndx, int dim, double x[dim]) { + if(dim != self->npar || ndx >= self->npts) { + fprintf(stderr,"%s:%d: index out of bounds\n", + __FILE__,__LINE__); + exit(EXIT_FAILURE); + } memcpy(self->s[ndx], x, dim * sizeof(x[0])); - return 0; } // Copy state vector with index "ndx" into vector x. 
@@ -48,9 +58,12 @@ State *State_new(int npts, int npar) { CHECKMEM(self); self->npts = npts; self->npar = npar; + self->cost = malloc(npts * sizeof(self->cost[0])); + CHECKMEM(self->cost); self->s = malloc(npts * sizeof(self->s[0])); CHECKMEM(self->s); for(i=0; i < npts; ++i) { + self->cost[i] = strtod("NaN", NULL); self->s[i] = malloc(npar * sizeof(double)); CHECKMEM(self->s[i]); } @@ -62,6 +75,7 @@ void State_free(State *self) { int i; for(i=0; i < self->npts; ++i) free(self->s[i]); + free(self->cost); free(self->s); free(self); } @@ -69,27 +83,37 @@ void State_free(State *self) { // Construct a new State object by reading a file State *State_read(FILE *fp) { int i, j, npts, npar, status; + State *self = NULL; status = fscanf(fp, "%d %d", &npts, &npar); if(status != 2) { - fprintf(stderr,"%s:%d: Can't read dimensions in state file\n", - __FILE__,__LINE__); - return NULL; + fprintf(stderr,"%s:%d: status=%d\n", __FILE__,__LINE__,status); + goto fail; } - State *self = State_new(npts, npar); + self = State_new(npts, npar); CHECKMEM(self); for(i=0; i < npts; ++i) { + status = fscanf(fp, "%lf", self->cost + i); + if(status != 1) { + fprintf(stderr,"%s:%d: status=%d\n", __FILE__,__LINE__,status); + goto fail; + } for(j=0; j < npar; ++j) { status = fscanf(fp, "%lf", self->s[i]+j); if(status != 1) { - fprintf(stderr,"%s:%d:" - " Can't read value (%d,%d) in state file\n", - __FILE__,__LINE__,i,j); - State_free(self); - return NULL; + fprintf(stderr,"%s:%d: status=%d\n", __FILE__,__LINE__,status); + goto fail; } } } + if(0) { + fail: + fprintf(stderr,"%s:%d: Can't read state file\n", + __FILE__,__LINE__); + if(self) + State_free(self); + return NULL; + } return self; } @@ -98,21 +122,25 @@ int State_print(State *self, FILE *fp) { int i, j, status; status = fprintf(fp, "%d %d\n", self->npts, self->npar); - if(status==0) { - fprintf(stderr,"%s:%d: can't write to file\n", - __FILE__,__LINE__); - return EIO; - } + if(status==0) + goto fail; + for(i=0; i < self->npts; ++i) { + 
status = fprintf(fp, "%0.18lf", self->cost[i]); + if(status == 0) + goto fail; for(j=0; j < self->npar; ++j) { status = fprintf(fp, " %0.18lf", self->s[i][j]); - if(status == 0) { - fprintf(stderr,"%s:%d: can't write to file\n", - __FILE__,__LINE__); - return EIO; - } + if(status == 0) + goto fail; } putc('\n', fp); } + if(0) { + fail: + fprintf(stderr,"%s:%d: Can't write state file\n", + __FILE__,__LINE__); + return EIO; + } return 0; } diff --git a/src/state.h b/src/state.h index 325fbd2d..15941e51 100644 --- a/src/state.h +++ b/src/state.h @@ -10,7 +10,8 @@ State *State_new(int npts, int npar); void State_free(State *self); State *State_read(FILE *fp); int State_print(State *self, FILE *fp); -int State_setVector(State *self, int ndx, int dim, double x[dim]); +void State_setVector(State *self, int ndx, int dim, double x[dim]); int State_getVector(State *self, int ndx, int dim, double x[dim]); +void State_setCost(State *self, int ndx, double cost); #endif From b7f87880bd37a250e08ee5660b91b9ae7f1c2089 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sun, 4 Mar 2018 15:24:53 -0700 Subject: [PATCH 051/101] Modified legofit.c and diffev.c. Option --stateIn specifies the name of a state file, which can be read on input. Option --stateOut specifies the state file to be written at the end. This allows Legofit to pick up where it left off. The format of the state file is as follows: Line 1: two integers, the number of points and the number of free parameters. Each remaining line describes one point (one set of parameter values). On these following lines, each field is a floating-point number. The first number is the cost value. The remaining numbers are the values of the free parameters. 
--- src/Makefile | 4 +-- src/state.c | 40 ++++++++++++--------- src/state.h | 1 + src/try.c | 40 +++++---------------- test/Makefile | 4 +-- test/xbinary.c | 4 +-- test/xdiffev.c | 95 ++++++++++++++++++++++++++++++++------------------ test/xstate.c | 18 ++++------ 8 files changed, 108 insertions(+), 98 deletions(-) diff --git a/src/Makefile b/src/Makefile index d7ccc4f0..f8459615 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,7 @@ destination := $(HOME)/bin global_destination := $(HOME)/group/bin -#opt := -DNDEBUG -O3 -finline-functions # For full optimization -opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging +opt := -DNDEBUG -O3 -finline-functions # For full optimization +#opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging #opt := -O3 -finline-functions # Optimization + debugging #prof := -pg -rdynamic # For profiling prof := diff --git a/src/state.c b/src/state.c index 1b852400..603a9b2a 100644 --- a/src/state.c +++ b/src/state.c @@ -29,6 +29,16 @@ void State_setCost(State *self, int ndx, double cost) { self->cost[ndx] = cost; } +double State_getCost(State *self, int ndx) { + if(ndx >= self->npts) { + fprintf(stderr,"%s:%d: index out of bounds\n", + __FILE__,__LINE__); + exit(EXIT_FAILURE); + } + return self->cost[ndx]; +} + + // Set state vector with index "ndx" equal to vector x. 
void State_setVector(State *self, int ndx, int dim, double x[dim]) { if(dim != self->npar || ndx >= self->npts) { @@ -106,15 +116,14 @@ State *State_read(FILE *fp) { } } } - if(0) { - fail: - fprintf(stderr,"%s:%d: Can't read state file\n", - __FILE__,__LINE__); - if(self) - State_free(self); - return NULL; - } return self; + + fail: + fprintf(stderr,"%s:%d: Can't read state file\n", + __FILE__,__LINE__); + if(self) + State_free(self); + return NULL; } // Print State object to a file @@ -126,21 +135,20 @@ int State_print(State *self, FILE *fp) { goto fail; for(i=0; i < self->npts; ++i) { - status = fprintf(fp, "%0.18lf", self->cost[i]); + status = fprintf(fp, "%0.18lg", self->cost[i]); if(status == 0) goto fail; for(j=0; j < self->npar; ++j) { - status = fprintf(fp, " %0.18lf", self->s[i][j]); + status = fprintf(fp, " %0.18lg", self->s[i][j]); if(status == 0) goto fail; } putc('\n', fp); } - if(0) { - fail: - fprintf(stderr,"%s:%d: Can't write state file\n", - __FILE__,__LINE__); - return EIO; - } return 0; + + fail: + fprintf(stderr,"%s:%d: Can't write state file\n", + __FILE__,__LINE__); + return EIO; } diff --git a/src/state.h b/src/state.h index 15941e51..49646852 100644 --- a/src/state.h +++ b/src/state.h @@ -13,5 +13,6 @@ int State_print(State *self, FILE *fp); void State_setVector(State *self, int ndx, int dim, double x[dim]); int State_getVector(State *self, int ndx, int dim, double x[dim]); void State_setCost(State *self, int ndx, double cost); +double State_getCost(State *self, int ndx); #endif diff --git a/src/try.c b/src/try.c index e65e95ab..4bc15112 100644 --- a/src/try.c +++ b/src/try.c @@ -1,36 +1,14 @@ #include -#include -#include int main(void) { - long h; - char token[100], *end; - strcpy(token, "-1"); - - h = strtol(token, &end, 10); - if(end==token || h<0) // token isn't a nonnegative integer - printf("%s is Not a nonnegative integer\n", token); - else // token is a nonnegative integer - printf("%s IS a nonnegative integer: value=%ld\n", token, 
h); + double x; + int status; + while(1) { + status = scanf("%lf", &x); + if(status != 1) + break; + printf("%g\n", x); + } - strcpy(token, " -eI"); - h = strtol(token, &end, 10); - if(end==token || h<0) // token isn't a nonnegative integer - printf("%s is Not a nonnegative integer\n", token); - else // token is a nonnegative integer - printf("%s IS a nonnegative integer: value=%ld\n", token, h); - - strcpy(token, " -1"); - h = strtol(token, &end, 10); - if(end==token || h<0) // token isn't a nonnegative integer - printf("%s is Not a nonnegative integer\n", token); - else // token is a nonnegative integer - printf("%s IS a nonnegative integer: value=%ld\n", token, h); - - strcpy(token, " 123 "); - h = strtol(token, &end, 10); - if(end==token || h<0) // token isn't a nonnegative integer - printf("%s is Not a nonnegative integer\n", token); - else // token is a nonnegative integer - printf("%s IS a nonnegative integer: value=%ld\n", token, h); + return 0; } diff --git a/test/Makefile b/test/Makefile index f4d8ef49..72408385 100644 --- a/test/Makefile +++ b/test/Makefile @@ -91,8 +91,8 @@ XJOBQUEUE := xjobqueue.o jobqueue.o misc.o xjobqueue : $(XJOBQUEUE) $(CC) $(CFLAGS) -o $@ $(XJOBQUEUE) $(lib) -XDIFFEV := xdiffev.o diffev.o misc.o binary.o lblndx.o jobqueue.o parkeyval.o \ - simsched.o +XDIFFEV := xdiffev.o diffev.o binary.o lblndx.o jobqueue.o parkeyval.o \ + simsched.o state.o misc.o xdiffev : $(XDIFFEV) $(CC) $(CFLAGS) -o $@ $(XDIFFEV) $(lib) diff --git a/test/xbinary.c b/test/xbinary.c index bfaf5cc2..4336169a 100644 --- a/test/xbinary.c +++ b/test/xbinary.c @@ -2,7 +2,7 @@ * @file xbinary.c * @brief Test file binary.c * - * @copyright Copyright (c) 2014, Alan R. Rogers + * @copyright Copyright (c) 2014, Alan R. Rogers * . This file is released under the Internet * Systems Consortium License, which can be found in file "LICENSE". 
*/ @@ -99,7 +99,7 @@ int main(void) { printf("32-bit key %u -> hash %u\n", key32, uint32Hash(key32)); uint64_t key64 = 1234u; - printf("64-bit key %lu -> hash %u\n", key64, uint64Hash(key64)); + printf("64-bit key %llu -> hash %u\n", key64, uint64Hash(key64)); return 0; } diff --git a/test/xdiffev.c b/test/xdiffev.c index 9b0a224a..b46ac011 100644 --- a/test/xdiffev.c +++ b/test/xdiffev.c @@ -1,6 +1,6 @@ #include "diffev.h" -#include "misc.h" #include "simsched.h" +#include "state.h" #include #include #include @@ -11,10 +11,19 @@ #include #include +# define CHECKMEM(x) do { \ + if((x)==NULL) { \ + fprintf(stderr, "%s:%s:%d: allocation error\n", \ + __FILE__,__func__,__LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } while(0) + void usage(void); double objFunc(int dim, double x[dim], void *jdat, void *tdat); void initStateVec(int ndx, void *void_p, int n, double x[n], gsl_rng *rng); +void prOpt(const char *opt, const char *description); #define RUGGED @@ -72,22 +81,29 @@ void initStateVec(int ndx, void *void_p, int n, double x[n], gsl_rng *rng){ #undef RUGGED +/// Describe an option. For use in "usage" functions. +/// @param[in] opt Name of option. +/// @param[in] description Description of option. +void prOpt(const char *opt, const char *description) { + fprintf(stderr, " %s\n %s\n", opt, description); + return; +} + /// Print usage message and exit. 
void usage(void) { fprintf(stderr, "usage: diffev [options]\n"); fprintf(stderr, " where options may include:\n"); /* misc */ - tellopt("-s or --strategy ", "strategy"); - tellopt("-g or --genmax ", "max generations"); - tellopt("-r or --refresh ", "refresh interval"); - tellopt("-n or --nParam", " number of parameters"); - tellopt("-p or --ptsPerDim ", "points per dimension"); - tellopt("-F or --F ", "DE weight factor"); - tellopt("-c or --crossOver ", "crossover probability"); - tellopt("-t or --threads ", "number of threads (default is auto)"); - tellopt("-v or --verbose", "more output"); - tellopt("-h or --help", "print this message"); + prOpt("-s or --strategy ", "strategy"); + prOpt("-g or --genmax ", "max generations"); + prOpt("-r or --refresh ", "refresh interval"); + prOpt("-n or --nParam", " number of parameters"); + prOpt("-p or --ptsPerDim ", "points per dimension"); + prOpt("-F or --F ", "DE weight factor"); + prOpt("-c or --crossOver ", "crossover probability"); + prOpt("-v or --verbose", "more output"); + prOpt("-h or --help", "print this message"); exit(1); } @@ -96,7 +112,6 @@ int main(int argc, char *argv[]) { static struct option myopts[] = { /* {char *name, int has_arg, int *flag, int val} */ {"strategy", required_argument, 0, 's'}, - {"threads", required_argument, 0, 't'}, {"genmax", required_argument, 0, 'g'}, {"refresh", required_argument, 0, 'r'}, {"nParam", required_argument, 0, 'n'}, @@ -121,7 +136,7 @@ int main(int argc, char *argv[]) { double ytol = 1e-4; // convergence criterion int i; - int nthreads = 0; + int nthreads = 2; time_t currtime = time(NULL); unsigned baseSeed = currtime % UINT_MAX; gsl_rng *rng = gsl_rng_alloc(gsl_rng_taus); @@ -132,7 +147,7 @@ int main(int argc, char *argv[]) { // command line arguments for(;;) { - i = getopt_long(argc, argv, "T:s:t:g:r:n:p:F:c:hv", myopts, &optndx); + i = getopt_long(argc, argv, "T:s:g:r:n:p:F:c:hv", myopts, &optndx); if(i == -1) break; switch (i) { @@ -146,9 +161,6 @@ int main(int argc, 
char *argv[]) { case 'T': ytol = strtod(optarg, 0); break; - case 't': - nthreads = strtol(optarg, NULL, 10); - break; case 'g': genmax = strtol(optarg, NULL, 10); break; @@ -191,27 +203,45 @@ int main(int argc, char *argv[]) { nPts = ptsPerDim * dim; - if(nthreads == 0) - nthreads = getNumCores(); - printf("Using up to %d threads\n", nthreads); + State *state = State_new(nPts, dim); + CHECKMEM(state); + for(i=0; i < nPts; ++i) { + double x[dim]; + initStateVec(i, initVec, dim, x, rng); + State_setVector(state, i, dim, x); + } + // Check inputs - if(dim <= 0) - eprintf("%s:%d:Err dim=%d, should be > 0\n", __FILE__, __LINE__, dim); - if(nPts <= 0) - eprintf("%s:%d:Err nPts=%d, should be > 0\n", __FILE__, __LINE__, + if(dim <= 0) { + fprintf(stderr, + "%s:%d:Err dim=%d, should be > 0\n", __FILE__, __LINE__, dim); + exit(1); + } + if(nPts <= 0) { + fprintf(stderr, + "%s:%d:Err nPts=%d, should be > 0\n", __FILE__, __LINE__, nPts); + exit(1); + } - if((CR < 0) || (CR > 1.0)) - eprintf("%s:%d:Err CR=%f, should be in [0,1]\n", __FILE__, __LINE__, + if((CR < 0) || (CR > 1.0)) { + fprintf(stderr, + "%s:%d:Err CR=%f, should be in [0,1]\n", __FILE__, __LINE__, CR); - if(refresh <= 0) - eprintf("%s:%d:Err refresh=%d, should be > 0\n", + exit(1); + } + if(refresh <= 0) { + fprintf(stderr,"%s:%d:Err refresh=%d, should be > 0\n", __FILE__, __LINE__, refresh); - if(genmax <= 0) - eprintf("%s:%d:Err genmax=%d, should be > 0\n", + exit(1); + } + if(genmax <= 0) { + fprintf(stderr,"%s:%d:Err genmax=%d, should be > 0\n", __FILE__, __LINE__, genmax); + exit(1); + } printf("Strategy: %s\n", diffEvStrategyLbl(strategy)); printf("nPts=%d F=%-4.2lg CR=%-4.2lg\n", nPts, F, CR); @@ -235,9 +265,8 @@ int main(int argc, char *argv[]) { .threadData = NULL, .ThreadState_new = NULL, .ThreadState_free = NULL, - .initData = initVec, - .initialize = initStateVec, - .simSched = simSched + .simSched = simSched, + .state = state }; double estimate[dim]; diff --git a/test/xstate.c b/test/xstate.c index 
ec8b4140..ed56aea2 100644 --- a/test/xstate.c +++ b/test/xstate.c @@ -40,21 +40,14 @@ int main(int argc, char **argv) { const int npts=3, npar=2; int i, status; double x[npts][npar] = {{1.0, 2.0}, {3.0,4.0}, {5.0, 6.0}}; + double c[npts] = {0.01, 0.02, 0.03}; State *s = State_new(npts, npar); CHECKMEM(s); + assert(npts == State_npoints(s)); + assert(npar == State_nparameters(s)); for(i=0; i < npts; ++i) { - status = State_setVector(s, i, npar, x[i]); - switch(status) { - case 0: - break; - case EINVAL: - fprintf(stderr,"%s:%d: Dimension mismatch in State_setVector\n", - __FILE__,__LINE__); - exit(1); - default: - fprintf(stderr,"%s:%d: Unknown error\n", __FILE__,__LINE__); - exit(1); - } + State_setVector(s, i, npar, x[i]); + State_setCost(s, i, c[i]); } FILE *fp = fopen(fname, "w"); @@ -85,6 +78,7 @@ int main(int argc, char **argv) { for(i=0; i Date: Sun, 4 Mar 2018 15:44:45 -0700 Subject: [PATCH 052/101] Modified State_print so that point with minimum cost is printed first. --- src/state.c | 21 ++++++++++++++++++++- test/xstate.c | 2 +- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/state.c b/src/state.c index 603a9b2a..555c9832 100644 --- a/src/state.c +++ b/src/state.c @@ -128,13 +128,32 @@ State *State_read(FILE *fp) { // Print State object to a file int State_print(State *self, FILE *fp) { - int i, j, status; + int i, j, status, imin=0; + + // find index of minimum cost + for(i=1; i < self->npts; ++i) + if(self->cost[i] < self->cost[imin]) + imin = i; status = fprintf(fp, "%d %d\n", self->npts, self->npar); if(status==0) goto fail; + // print point with minimum cost first + status = fprintf(fp, "%0.18lg", self->cost[imin]); + if(status == 0) + goto fail; + for(j=0; j < self->npar; ++j) { + status = fprintf(fp, " %0.18lg", self->s[imin][j]); + if(status == 0) + goto fail; + } + putc('\n', fp); + + // print remainint lines for(i=0; i < self->npts; ++i) { + if(i == imin) + continue; status = fprintf(fp, "%0.18lg", self->cost[i]); if(status == 
0) goto fail; diff --git a/test/xstate.c b/test/xstate.c index ed56aea2..ccbfa063 100644 --- a/test/xstate.c +++ b/test/xstate.c @@ -39,7 +39,7 @@ int main(int argc, char **argv) { const char *fname = "xstate.tmp"; const int npts=3, npar=2; int i, status; - double x[npts][npar] = {{1.0, 2.0}, {3.0,4.0}, {5.0, 6.0}}; + double x[npts][npar] = {{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}; double c[npts] = {0.01, 0.02, 0.03}; State *s = State_new(npts, npar); CHECKMEM(s); From 86a62cbba1e9e85fc33a601897e4e2692b8d0e61 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sun, 4 Mar 2018 16:16:38 -0700 Subject: [PATCH 053/101] Version 1.5 --- src/legofit.c | 267 +++++++++++++++++++++++++------------------------- src/version.h | 2 +- 2 files changed, 135 insertions(+), 134 deletions(-) diff --git a/src/legofit.c b/src/legofit.c index 95c64a5a..52177a2b 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -1,3 +1,4 @@ + /** @file legofit.c @page legofit @@ -167,42 +168,43 @@ extern pthread_mutex_t seedLock; extern unsigned long rngseed; extern volatile sig_atomic_t sigstat; -void usage(void); -void *ThreadState_new(void *notused); -void ThreadState_free(void *rng); +void usage(void); +void *ThreadState_new(void *notused); +void ThreadState_free(void *rng); void *ThreadState_new(void *notused) { - // Lock seed, initialize random number generator, increment seed, - // and unlock. - gsl_rng *rng = gsl_rng_alloc(gsl_rng_taus); + // Lock seed, initialize random number generator, increment seed, + // and unlock. + gsl_rng *rng = gsl_rng_alloc(gsl_rng_taus); - pthread_mutex_lock(&seedLock); + pthread_mutex_lock(&seedLock); gsl_rng_set(rng, rngseed); - rngseed = (rngseed == ULONG_MAX ? 0 : rngseed+1); - pthread_mutex_unlock(&seedLock); + rngseed = (rngseed == ULONG_MAX ? 
0 : rngseed + 1); + pthread_mutex_unlock(&seedLock); return rng; } void ThreadState_free(void *rng) { - gsl_rng_free( (gsl_rng *) rng ); + gsl_rng_free((gsl_rng *) rng); } void usage(void) { #if COST==KL_COST || COST==LNL_COST - fprintf(stderr,"usage: legofit [options] input.lgo sitepat.txt\n"); - fprintf(stderr," where file input.lgo describes population history,\n" + fprintf(stderr, "usage: legofit [options] input.lgo sitepat.txt\n"); + fprintf(stderr, " where file input.lgo describes population history,\n" " and file sitepat.txt contains site pattern frequencies.\n"); #else - fprintf(stderr,"usage: legofit [options] -u " + fprintf(stderr, "usage: legofit [options] -u " " -n input.lgo sitepat.txt\n"); - fprintf(stderr," where is the mutation rate per nucleotide\n" + fprintf(stderr, + " where is the mutation rate per nucleotide\n" " site per generation, is the number of\n" " nucleotides per haploid genome, file input.lgo describes\n" " population history, and file sitepat.txt contains site\n" " pattern frequencies.\n"); #endif - fprintf(stderr,"Options may include:\n"); + fprintf(stderr, "Options may include:\n"); tellopt("-T or --tol ", "termination criterion"); tellopt("-t or --threads ", "number of threads (default is auto)"); tellopt("-F or --scaleFactor ", "set DE scale factor"); @@ -213,8 +215,9 @@ void usage(void) { tellopt("-p or --ptsPerDim ", "number of DE points per free var"); tellopt("--stateIn ", "read initial state of optimizer from file"); - tellopt("--stateOut ", "write final state of optimizer to file"); - tellopt("-1 or --singletons", "Use singleton site patterns"); + tellopt("--stateOut ", + "write final state of optimizer to file"); + tellopt("-1 or --singletons", "Use singleton site patterns"); tellopt("-v or --verbose", "verbose output"); tellopt("--version", "Print version and exit"); tellopt("-h or --help", "print this message"); @@ -229,8 +232,8 @@ int main(int argc, char **argv) { static struct option myopts[] = { /* {char *name, int 
has_arg, int *flag, int val} */ {"threads", required_argument, 0, 't'}, - {"crossover", required_argument, 0, 'x'}, - {"scaleFactor", required_argument, 0, 'F'}, + {"crossover", required_argument, 0, 'x'}, + {"scaleFactor", required_argument, 0, 'F'}, {"strategy", required_argument, 0, 's'}, {"stage", required_argument, 0, 'S'}, {"tol", required_argument, 0, 'T'}, @@ -250,34 +253,34 @@ int main(int argc, char **argv) { hdr("legofit: estimate population history"); - int i, j; - time_t currtime = time(NULL); - unsigned long pid = (unsigned long) getpid(); - double lo_twoN = 1.0, hi_twoN = 1e7; // twoN bounds - double lo_t = 0.0, hi_t = 1e7; // t bounds - int nThreads = 0; // total number of threads - int doSing=0; // nonzero means use singleton site patterns - int status, optndx; - long simreps = 1000000; - char lgofname[200] = { '\0' }; - char patfname[200] = { '\0' }; - char stateOutName[200] = { '\0' }; - char stateInName[200] = { '\0' }; - FILE *stateOut = NULL; - FILE *stateIn = NULL; - - // DiffEv parameters - double F = 0.9; - double CR = 0.8; + int i, j; + time_t currtime = time(NULL); + unsigned long pid = (unsigned long) getpid(); + double lo_twoN = 1.0, hi_twoN = 1e7; // twoN bounds + double lo_t = 0.0, hi_t = 1e7; // t bounds + int nThreads = 0; // total number of threads + int doSing = 0; // nonzero means use singleton site patterns + int status, optndx; + long simreps = 1000000; + char lgofname[200] = { '\0' }; + char patfname[200] = { '\0' }; + char stateOutName[200] = { '\0' }; + char stateInName[200] = { '\0' }; + FILE *stateOut = NULL; + FILE *stateIn = NULL; + + // DiffEv parameters + double F = 0.9; + double CR = 0.8; #if COST!=KL_COST && COST!=LNL_COST - double u = 0.0; // mutation rate per site per generation - long nnuc = 0; // number of nucleotides per haploid genome + double u = 0.0; // mutation rate per site per generation + long nnuc = 0; // number of nucleotides per haploid genome #endif - double ytol = 1e-4; // stop when yspread <= ytol - 
int strategy = 1; - int ptsPerDim = 10; - int verbose = 0; - SimSched *simSched = SimSched_new(); + double ytol = 1e-4; // stop when yspread <= ytol + int strategy = 1; + int ptsPerDim = 10; + int verbose = 0; + SimSched *simSched = SimSched_new(); #if defined(__DATE__) && defined(__TIME__) printf("# Program was compiled: %s %s\n", __DATE__, __TIME__); @@ -290,13 +293,12 @@ int main(int argc, char **argv) { putchar('\n'); fflush(stdout); - rngseed = currtime^pid; + rngseed = currtime ^ pid; // command line arguments for(;;) { #if COST==KL_COST || COST==LNL_COST - i = getopt_long(argc, argv, "T:t:F:p:s:S:a:vx:1h", - myopts, &optndx); + i = getopt_long(argc, argv, "T:t:F:p:s:S:a:vx:1h", myopts, &optndx); #else i = getopt_long(argc, argv, "T:t:F:p:s:S:a:vx:u:n:1h", myopts, &optndx); @@ -311,36 +313,34 @@ int main(int argc, char **argv) { case 't': nThreads = strtol(optarg, NULL, 10); break; - case 'F': - F = strtod(optarg, 0); - break; + case 'F': + F = strtod(optarg, 0); + break; case 'p': ptsPerDim = strtol(optarg, NULL, 10); break; - case 's': - strategy = strtol(optarg, NULL, 10); + case 's': + strategy = strtol(optarg, NULL, 10); break; - case 'S': - { - // Add a stage to simSched. - char b[20], *g, *r; - status = snprintf(b, sizeof b, "%s", optarg); - if(status >= sizeof b) { - fprintf(stderr,"%s:%d: buffer overflow reading arg %s\n", - __FILE__,__LINE__,optarg); - exit(EXIT_FAILURE); - } - g = r = b; - (void) strsep(&r, "@"); - if(r==NULL - || strlen(r) == 0 - || strlen(g) == 0) - usage(); - long stageGen = strtol(g, NULL, 10); - long stageRep = strtol(r, NULL, 10); - simreps = stageRep; - SimSched_append(simSched, stageGen, stageRep); + case 'S': + { + // Add a stage to simSched. 
+ char b[20], *g, *r; + status = snprintf(b, sizeof b, "%s", optarg); + if(status >= sizeof b) { + fprintf(stderr, "%s:%d: buffer overflow reading arg %s\n", + __FILE__, __LINE__, optarg); + exit(EXIT_FAILURE); } + g = r = b; + (void) strsep(&r, "@"); + if(r == NULL || strlen(r) == 0 || strlen(g) == 0) + usage(); + long stageGen = strtol(g, NULL, 10); + long stageRep = strtol(r, NULL, 10); + simreps = stageRep; + SimSched_append(simSched, stageGen, stageRep); + } break; case 'v': verbose = 1; @@ -350,9 +350,9 @@ int main(int argc, char **argv) { case 'T': ytol = strtod(optarg, 0); break; - case 'x': - CR = strtod(optarg, 0); - break; + case 'x': + CR = strtod(optarg, 0); + break; #if COST!=KL_COST && COST!=LNL_COST case 'u': u = strtod(optarg, 0); @@ -362,41 +362,42 @@ int main(int argc, char **argv) { break; #endif case 'y': - status=snprintf(stateOutName, sizeof(stateOutName), "%s", optarg); + status = + snprintf(stateOutName, sizeof(stateOutName), "%s", optarg); if(status >= sizeof(stateOutName)) { - fprintf(stderr,"%s:%d: buffer overflow\n", - __FILE__,__LINE__); + fprintf(stderr, "%s:%d: buffer overflow\n", + __FILE__, __LINE__); exit(EXIT_FAILURE); } stateOut = fopen(stateOutName, "w"); - if(stateOut==NULL) { - fprintf(stderr,"%s:%d: can't open \"%s\" for output.\n", - __FILE__,__LINE__, stateOutName); + if(stateOut == NULL) { + fprintf(stderr, "%s:%d: can't open \"%s\" for output.\n", + __FILE__, __LINE__, stateOutName); exit(EXIT_FAILURE); } break; case 'z': - status=snprintf(stateInName, sizeof(stateInName), "%s", optarg); + status = snprintf(stateInName, sizeof(stateInName), "%s", optarg); if(status >= sizeof(stateInName)) { - fprintf(stderr,"%s:%d: buffer overflow\n", - __FILE__,__LINE__); + fprintf(stderr, "%s:%d: buffer overflow\n", + __FILE__, __LINE__); exit(EXIT_FAILURE); } stateIn = fopen(stateInName, "r"); - if(stateIn==NULL) { - fprintf(stderr,"%s:%d: can't open \"%s\" for output.\n", - __FILE__,__LINE__, stateOutName); + if(stateIn == NULL) { + 
fprintf(stderr, "%s:%d: can't open \"%s\" for output.\n", + __FILE__, __LINE__, stateOutName); exit(EXIT_FAILURE); } break; case '1': - doSing=1; + doSing = 1; break; case 'h': usage(); break; default: - fprintf(stderr,"Can't parse option %c\n", i); + fprintf(stderr, "Can't parse option %c\n", i); usage(); } } @@ -406,22 +407,22 @@ int main(int argc, char **argv) { fprintf(stderr, "Command line must specify 2 input files.\n"); usage(); } - #if COST!=KL_COST && COST!=LNL_COST - if(u==0.0) { - fprintf(stderr,"Use -u to set mutation rate per generation.\n"); + if(u == 0.0) { + fprintf(stderr, "Use -u to set mutation rate per generation.\n"); usage(); } - if(nnuc==0) { - fprintf(stderr,"Use -n to set # of nucleotides per haploid genome.\n"); + if(nnuc == 0) { + fprintf(stderr, + "Use -n to set # of nucleotides per haploid genome.\n"); usage(); } #endif snprintf(lgofname, sizeof(lgofname), "%s", argv[optind]); assert(lgofname[0] != '\0'); - snprintf(patfname, sizeof(patfname), "%s", argv[optind+1]); + snprintf(patfname, sizeof(patfname), "%s", argv[optind + 1]); assert(patfname[0] != '\0'); // Default simulation schedule. @@ -436,26 +437,26 @@ int main(int argc, char **argv) { SimSched_print(simSched, stdout); Bounds bnd = { - .lo_twoN = lo_twoN, - .hi_twoN = hi_twoN, - .lo_t = lo_t, - .hi_t = hi_t + .lo_twoN = lo_twoN, + .hi_twoN = hi_twoN, + .lo_t = lo_t, + .hi_t = hi_t }; GPTree *gptree = GPTree_new(lgofname, bnd); - LblNdx lblndx = GPTree_getLblNdx(gptree); + LblNdx lblndx = GPTree_getLblNdx(gptree); - gsl_rng *rng = gsl_rng_alloc(gsl_rng_taus); + gsl_rng *rng = gsl_rng_alloc(gsl_rng_taus); gsl_rng_set(rng, rngseed); - rngseed = (rngseed == ULONG_MAX ? 0 : rngseed+1); + rngseed = (rngseed == ULONG_MAX ? 
0 : rngseed + 1); int dim = GPTree_nFree(gptree); // number of free parameters if(dim == 0) { - fprintf(stderr,"Error@%s:%d: no free parameters\n", - __FILE__,__LINE__); + fprintf(stderr, "Error@%s:%d: no free parameters\n", + __FILE__, __LINE__); exit(EXIT_FAILURE); } - int npts = dim*ptsPerDim; + int npts = dim * ptsPerDim; // DiffEv state array is a matrix with a row for each point // and a column for each parameter. @@ -464,11 +465,11 @@ int main(int argc, char **argv) { // read State from file state = State_read(stateIn); CHECKMEM(state); - }else{ + } else { // de novo State state = State_new(npts, dim); CHECKMEM(state); - for(i=0; i < npts; ++i) { + for(i = 0; i < npts; ++i) { double x[dim]; initStateVec(i, gptree, dim, x, rng); State_setVector(state, i, dim, x); @@ -476,7 +477,7 @@ int main(int argc, char **argv) { } if(nThreads == 0) - nThreads = ceil(0.75*getNumCores()); + nThreads = ceil(0.75 * getNumCores()); if(nThreads > npts) nThreads = npts; @@ -517,18 +518,18 @@ int main(int argc, char **argv) { BranchTab *obs = BranchTab_parse(patfname, &lblndx); if(doSing) { if(!BranchTab_hasSingletons(obs)) { - fprintf(stderr,"%s:%d: Command line includes singletons " + fprintf(stderr, "%s:%d: Command line includes singletons " "(-1 or --singletons)\n" " but none are present in \"%s\".\n", - __FILE__,__LINE__, patfname); + __FILE__, __LINE__, patfname); exit(EXIT_FAILURE); } - }else{ + } else { if(BranchTab_hasSingletons(obs)) { - fprintf(stderr,"%s:%d: Command line excludes singletons " + fprintf(stderr, "%s:%d: Command line excludes singletons " "(neither -1 nor --singletons)\n" " but singletons are present in \"%s\".\n", - __FILE__,__LINE__, patfname); + __FILE__, __LINE__, patfname); exit(EXIT_FAILURE); } } @@ -550,33 +551,33 @@ int main(int argc, char **argv) { }; // parameters for Differential Evolution - DiffEvPar dep = { + DiffEvPar dep = { .dim = dim, .ptsPerDim = ptsPerDim, - .refresh = 2, // how often to print a line of output + .refresh = 2, // how 
often to print a line of output .strategy = strategy, .nthreads = nThreads, .verbose = verbose, - .seed = ((unsigned long) time(NULL))-1ul, + .seed = ((unsigned long) time(NULL)) - 1ul, .F = F, .CR = CR, - .jobData = &costPar, + .jobData = &costPar, .JobData_dup = CostPar_dup, .JobData_free = CostPar_free, .objfun = costFun, - .threadData = NULL, - .ThreadState_new = ThreadState_new, - .ThreadState_free = ThreadState_free, + .threadData = NULL, + .ThreadState_new = ThreadState_new, + .ThreadState_free = ThreadState_free, .state = state, .simSched = simSched, .ytol = ytol, }; - double estimate[dim]; - double cost, yspread; + double estimate[dim]; + double cost, yspread; printf("Initial parameter values\n"); - GPTree_printParStore(gptree, stdout); + GPTree_printParStore(gptree, stdout); // Flush just before diffev so output file will be as complete as // possible while diffev is running. @@ -585,16 +586,16 @@ int main(int argc, char **argv) { status = diffev(dim, estimate, &cost, &yspread, dep, rng); printf("DiffEv %s. cost=%0.5le spread=%0.5le\n", - status==0 ? "converged" : "FAILED", cost, yspread); + status == 0 ? "converged" : "FAILED", cost, yspread); #if COST==LNL_COST - printf(" relspread=%e", yspread/cost); + printf(" relspread=%e", yspread / cost); #endif putchar('\n'); // Get mean site pattern branch lengths if(GPTree_setParams(gptree, dim, estimate)) { - fprintf(stderr,"%s:%d: free params violate constraints\n", - __FILE__,__LINE__); + fprintf(stderr, "%s:%d: free params violate constraints\n", + __FILE__, __LINE__); exit(1); } BranchTab *bt = patprob(gptree, simreps, doSing, rng); @@ -603,9 +604,9 @@ int main(int argc, char **argv) { printf("Fitted parameter values\n"); #if 1 - GPTree_printParStoreFree(gptree, stdout); + GPTree_printParStoreFree(gptree, stdout); #else - GPTree_printParStore(gptree, stdout); + GPTree_printParStore(gptree, stdout); #endif // Put site patterns and branch lengths into arrays. 
@@ -620,9 +621,9 @@ int main(int argc, char **argv) { orderpat(npat, ord, pat); printf("#%14s %10s\n", "SitePat", "BranchLen"); - char buff[100]; + char buff[100]; for(j = 0; j < npat; ++j) { - char buff2[100]; + char buff2[100]; snprintf(buff2, sizeof(buff2), "%s", patLbl(sizeof(buff), buff, pat[ord[j]], &lblndx)); printf("%15s %10.7lf\n", buff2, brlen[ord[j]]); @@ -639,7 +640,7 @@ int main(int argc, char **argv) { GPTree_sanityCheck(gptree, __FILE__, __LINE__); GPTree_free(gptree); SimSched_free(simSched); - fprintf(stderr,"legofit is finished\n"); + fprintf(stderr, "legofit is finished\n"); return 0; } diff --git a/src/version.h b/src/version.h index e3c9c610..ea891a41 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.4" +#define VERSION "1.5" #endif From 975b21e34740517e27e6ae2c28a885dd7f142084 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 6 Mar 2018 10:38:55 -0700 Subject: [PATCH 054/101] Added function State_readList, which creates a State object from a list of file names. Each file name should refer to a file in the format written by State_print. State_readList builds a State object that includes points evenly distributed across all the files in the list. 
--- src/Makefile | 4 +-- src/legofit.c | 5 ++- src/state.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++- src/state.h | 4 +++ src/typedefs.h | 1 + test/xstate.c | 62 ++++++++++++++++++++++++++++----- 6 files changed, 158 insertions(+), 12 deletions(-) diff --git a/src/Makefile b/src/Makefile index f8459615..d7ccc4f0 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,7 @@ destination := $(HOME)/bin global_destination := $(HOME)/group/bin -opt := -DNDEBUG -O3 -finline-functions # For full optimization -#opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging +#opt := -DNDEBUG -O3 -finline-functions # For full optimization +opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging #opt := -O3 -finline-functions # Optimization + debugging #prof := -pg -rdynamic # For profiling prof := diff --git a/src/legofit.c b/src/legofit.c index 52177a2b..3b401e8a 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -1,4 +1,3 @@ - /** @file legofit.c @page legofit @@ -25,12 +24,16 @@ of separations and of episodes of gene flow, and levels of gene flow. 
add stage with generations and simulation reps -p or --ptsPerDim number of DE points per free var + --stateIn + read initial state of optimizer from file --stateOut write final state of optimizer to file -1 or --singletons Use singleton site patterns -v or --verbose verbose output + --version + Print version and exit -h or --help print this message diff --git a/src/state.c b/src/state.c index 555c9832..4fb55a31 100644 --- a/src/state.c +++ b/src/state.c @@ -6,12 +6,41 @@ #include #include +struct NameList { + char *name; + struct NameList *next; +}; + struct State { int npts, npar; // numbers of points and parameters double *cost; // cost[i] is cost function at i'th point double **s; // s[i][j]=value of j'th param at i'th point }; +NameList *NameList_append(NameList *self, const char *name) { + if(self == NULL) { + self = malloc(sizeof(NameList)); + CHECKMEM(self); + self->name = strdup(name); + self->next = NULL; + }else + self->next = NameList_append(self->next, name); + return self; +} + +void NameList_free(NameList *self) { + if(self == NULL) + return; + NameList_free(self->next); + free(self); +} + +int NameList_size(NameList *self) { + if(self==0) + return 0; + return 1 + NameList_size(self->next); +} + int State_npoints(State *self) { return self->npts; } @@ -150,7 +179,7 @@ int State_print(State *self, FILE *fp) { } putc('\n', fp); - // print remainint lines + // print remaining lines for(i=0; i < self->npts; ++i) { if(i == imin) continue; @@ -171,3 +200,66 @@ int State_print(State *self, FILE *fp) { __FILE__,__LINE__); return EIO; } + +State *State_readList(NameList *list) { + int nstates = NameList_size(list); + if(nstates==0) + return NULL; + + State *state[nstates]; + NameList *node; + int i, j, k, npts=-1, npar; + + for(i=0, node=list; inext) { + + // Create a State object from each file name in list. + FILE *fp = fopen(node->name, "r"); + state[i] = State_read(fp); + CHECKMEM(state[i]); + fclose(fp); + + // Make sure dimensions are compatible. 
+ if(npts < 0) { + npts = State_npoints(state[i]); + npar = State_nparameters(state[i]); + }else{ + if(npts != State_npoints(state[i]) + || npar != State_nparameters(state[i])) { + fprintf(stderr,"%s:%s:%d:" + " input state file \"%s\" has" + " incompatible dimensions.\n", + __FILE__,__func__,__LINE__, + node->name); + exit(EXIT_FAILURE); + } + } + } + + State *self = State_new(npts, npar); + CHECKMEM(self); + + // Figure out how many points to take from each State object. + // The goal is to take nearly equal numbers. + div_t qr = div(npts, nstates); + int npoints[nstates]; + int total=0; // for debugging + for(i=0; i < nstates; ++i) { + npoints[i] = qr.quot; + if(i < qr.rem) + ++npoints[i]; + total += npoints[i]; // for debugging + } + assert(total == npts); + + // Set vectors; free pointers in vector "state". + k = 0; // index into self->s + for(i=0; i < nstates; ++i) { + for(j=0; j < npoints[i]; ++j) { + State_setCost(self, k, state[i]->cost[j]); + State_setVector(self, k, npar, state[i]->s[j]); + ++k; + } + State_free(state[i]); + } + return self; +} diff --git a/src/state.h b/src/state.h index 49646852..55d136b8 100644 --- a/src/state.h +++ b/src/state.h @@ -4,6 +4,9 @@ #include "typedefs.h" #include +NameList *NameList_append(NameList *self, const char *name); +void NameList_free(NameList *self); +int NameList_size(NameList *self); int State_npoints(State *self); int State_nparameters(State *self); State *State_new(int npts, int npar); @@ -14,5 +17,6 @@ void State_setVector(State *self, int ndx, int dim, double x[dim]); int State_getVector(State *self, int ndx, int dim, double x[dim]); void State_setCost(State *self, int ndx, double cost); double State_getCost(State *self, int ndx); +State *State_readList(NameList *list); #endif diff --git a/src/typedefs.h b/src/typedefs.h index 5b1f2ad9..be86710f 100644 --- a/src/typedefs.h +++ b/src/typedefs.h @@ -16,6 +16,7 @@ typedef struct GPTree GPTree; typedef struct HashTab HashTab; typedef struct HashTabSeq 
HashTabSeq; typedef struct LblNdx LblNdx; +typedef struct NameList NameList; typedef struct NodeStore NodeStore; typedef enum ParamStatus ParamStatus; typedef enum ParamType ParamType; diff --git a/test/xstate.c b/test/xstate.c index ccbfa063..a459ea63 100644 --- a/test/xstate.c +++ b/test/xstate.c @@ -37,17 +37,29 @@ int main(int argc, char **argv) { } const char *fname = "xstate.tmp"; + const char *fname2 = "xstate2.tmp"; + + NameList *list=NULL; + list = NameList_append(list, fname); + list = NameList_append(list, fname2); + assert(2 == NameList_size(list)); + const int npts=3, npar=2; int i, status; - double x[npts][npar] = {{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}; - double c[npts] = {0.01, 0.02, 0.03}; + double x1[npts][npar] = {{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}; + double x2[npts][npar] = {{7.0, 8.0}, {9.0, 10.0}, {11.0, 12.0}}; + double c1[npts] = {0.01, 0.02, 0.03}; + double c2[npts] = {0.04, 0.05, 0.06}; State *s = State_new(npts, npar); + State *s2 = State_new(npts, npar); CHECKMEM(s); assert(npts == State_npoints(s)); assert(npar == State_nparameters(s)); for(i=0; i < npts; ++i) { - State_setVector(s, i, npar, x[i]); - State_setCost(s, i, c[i]); + State_setVector(s, i, npar, x1[i]); + State_setCost(s, i, c1[i]); + State_setVector(s2, i, npar, x2[i]); + State_setCost(s2, i, c2[i]); } FILE *fp = fopen(fname, "w"); @@ -63,8 +75,23 @@ int main(int argc, char **argv) { fprintf(stderr,"%s:%d: Unknown error\n", __FILE__,__LINE__); exit(1); } + fclose(fp); - State_free(s); + fp = fopen(fname2, "w"); + assert(fp); + status = State_print(s2, fp); + switch(status) { + case 0: + break; + case EIO: + fprintf(stderr,"%s:%d: can't write to file\n", __FILE__,__LINE__); + exit(1); + default: + fprintf(stderr,"%s:%d: Unknown error\n", __FILE__,__LINE__); + exit(1); + } + + State_free(s2); fclose(fp); fp = fopen(fname, "r"); @@ -72,15 +99,34 @@ int main(int argc, char **argv) { s = State_read(fp); CHECKMEM(s); fclose(fp); - unlink(fname); double y[npar]; for(i=0; i Date: Tue, 
6 Mar 2018 11:01:04 -0700 Subject: [PATCH 055/101] Modified legofit. It is now possible to list the --stateIn option several times, each time specifying a different file. The files should have consistent numbers of points and parameters. When several state files are provided, the initial state of the differential evolution algorithm combines points from all of the state files. --- src/Makefile | 4 ++-- src/legofit.c | 34 +++++++++++++--------------------- src/state.c | 7 +++++++ src/state.h | 1 + 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/Makefile b/src/Makefile index d7ccc4f0..f8459615 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,7 @@ destination := $(HOME)/bin global_destination := $(HOME)/group/bin -#opt := -DNDEBUG -O3 -finline-functions # For full optimization -opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging +opt := -DNDEBUG -O3 -finline-functions # For full optimization +#opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging #opt := -O3 -finline-functions # Optimization + debugging #prof := -pg -rdynamic # For profiling prof := diff --git a/src/legofit.c b/src/legofit.c index 3b401e8a..4a8637bf 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -25,7 +25,7 @@ of separations and of episodes of gene flow, and levels of gene flow. -p or --ptsPerDim number of DE points per free var --stateIn - read initial state of optimizer from file + read initial state of optimizer from file. Option may be repeated. --stateOut write final state of optimizer to file -1 or --singletons @@ -217,9 +217,9 @@ void usage(void) { "add stage with generations and simulation reps"); tellopt("-p or --ptsPerDim ", "number of DE points per free var"); tellopt("--stateIn ", - "read initial state of optimizer from file"); + "read initial state from file.
Option may be repeated."); tellopt("--stateOut ", - "write final state of optimizer to file"); + "write final state to file"); tellopt("-1 or --singletons", "Use singleton site patterns"); tellopt("-v or --verbose", "verbose output"); tellopt("--version", "Print version and exit"); @@ -268,9 +268,8 @@ int main(int argc, char **argv) { char lgofname[200] = { '\0' }; char patfname[200] = { '\0' }; char stateOutName[200] = { '\0' }; - char stateInName[200] = { '\0' }; + NameList *stateInNames = NULL; FILE *stateOut = NULL; - FILE *stateIn = NULL; // DiffEv parameters double F = 0.9; @@ -380,18 +379,8 @@ int main(int argc, char **argv) { } break; case 'z': - status = snprintf(stateInName, sizeof(stateInName), "%s", optarg); - if(status >= sizeof(stateInName)) { - fprintf(stderr, "%s:%d: buffer overflow\n", - __FILE__, __LINE__); - exit(EXIT_FAILURE); - } - stateIn = fopen(stateInName, "r"); - if(stateIn == NULL) { - fprintf(stderr, "%s:%d: can't open \"%s\" for output.\n", - __FILE__, __LINE__, stateOutName); - exit(EXIT_FAILURE); - } + stateInNames = NameList_append(stateInNames, optarg); + CHECKMEM(stateInNames); break; case '1': doSing = 1; @@ -464,9 +453,9 @@ int main(int argc, char **argv) { // DiffEv state array is a matrix with a row for each point // and a column for each parameter. 
State *state; - if(stateIn) { + if(stateInNames) { // read State from file - state = State_read(stateIn); + state = State_readList(stateInNames); CHECKMEM(state); } else { // de novo State @@ -493,8 +482,11 @@ int main(int argc, char **argv) { printf("# site pat input file: %s\n", patfname); printf("# free parameters : %d\n", dim); printf("# pts/parameter : %d\n", ptsPerDim); - if(stateIn) - printf("# input state file : %s\n", stateInName); + if(stateInNames) { + printf("# input state file(s):"); + NameList_print(stateInNames, stdout); + putchar('\n'); + } if(stateOut) printf("# output state file : %s\n", stateOutName); #if COST!=KL_COST && COST!=LNL_COST diff --git a/src/state.c b/src/state.c index 4fb55a31..51fc2df5 100644 --- a/src/state.c +++ b/src/state.c @@ -41,6 +41,13 @@ int NameList_size(NameList *self) { return 1 + NameList_size(self->next); } +void NameList_print(NameList *self, FILE *fp) { + if(self==NULL) + return; + fprintf(fp, " %s", self->name); + NameList_print(self->next, fp); +} + int State_npoints(State *self) { return self->npts; } diff --git a/src/state.h b/src/state.h index 55d136b8..e54acd72 100644 --- a/src/state.h +++ b/src/state.h @@ -7,6 +7,7 @@ NameList *NameList_append(NameList *self, const char *name); void NameList_free(NameList *self); int NameList_size(NameList *self); +void NameList_print(NameList *self, FILE *fp); int State_npoints(State *self); int State_nparameters(State *self); State *State_new(int npts, int npar); From db2e4caa4b7c71075cfe08a026b8db2ea8cc07d8 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 6 Mar 2018 11:30:19 -0700 Subject: [PATCH 056/101] Version 1.6. Legofit allows "--stateIn" to be listed multiple times. Documentation now reflects this. 
--- src/legofit.c | 39 ++++++++++++++++++++++++++++----------- src/state.c | 8 +++++++- src/version.h | 2 +- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/src/legofit.c b/src/legofit.c index 4a8637bf..81679446 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -58,10 +58,8 @@ algorithm maintains a swarm of points, each at a different set of parameter values. The objective function is evaluated at these points in a multithreaded job queue, so the program runs fasted on a machine with lots of cores. You can set the number of threads using the `-t` -argument. By default, the program uses as many threads as there are -processors on the machine---usually the number of hypercores. The -optimal number of threads is usually somewhat smaller that this -default. +argument. By default, the program uses 3/4 as many threads as there +are processors on the machine---usually the number of hypercores. The DE algorithm can be tuned via command line arguments `-F`, `-x`, `-s`, and `-p`. Details regarding these choices can be found in @@ -82,6 +80,32 @@ set up a simulation schedule with several stages. The algorithm is allowed to converge only during the final stage. I am currently using a 2-stage schedule: `-S 1000@10000 -S `1000@2000000`. +By default, the initial swarm of points consists of one point +representing the parameter values in the .lgo file, plus other points +scattered randomly throughout the feasible region of parameter +space. The total number of points defaults to 10 times the number of free +parameters. To change this number, see the --ptsPerDim option. + +The initial swarm of points can also be specified using the +`--stateIn` option. This reads a file specifying the initial state of +the swarm of points maintained by DE. The number of points and the +number of free parameters should agree with the values implied by the +.lgo file and the --ptsPerDim option. The format of this file is as +described below for the `--stateOut` option. 
+ +The `--stateIn` option may be given more than once, each time with a +different input file. When more than one file is given, Legofit +constructs the initial swarm of points by combining points from all +input files. + +The option `--stateOut` is used to define an output file for the final +state of the optimizer. This output file begins with a row giving the +number of points and the number of free parameters. After that, there +is a row for each point in the swarm of points maintained by +diffev.c. In each row, the first entry is the value of the cost +function at that point. The remaining entries give the free parameter +values in the same order in which they are printed by legofit. + The `-1` option tells legofit to use singleton site patterns--patterns in which the derived allele is present in only a single sample. This is a bad idea with low-coverage sequence data. It also behaves poorly @@ -132,13 +156,6 @@ Second, you can relax the tolerance. By default, this is 1e-4. It is reported in the legofit output. To double this value, use "-T 2e-4" or "--tol 2e-4". -The option "--stateOut" is used to define an output file for the -final state of the optimizer. This output file contains a row for each -point in the swarm of points maintained by diffev.c. In each row, the -first entry is the value of the cost function at that point. The -remaining entries give the parameter values in the same order in which -they are printed by legofit. - @copyright Copyright (c) 2016, 2017, 2018, Alan R. Rogers . This file is released under the Internet Systems Consortium License, which can be found in file "LICENSE". 
diff --git a/src/state.c b/src/state.c index 51fc2df5..87dcad7e 100644 --- a/src/state.c +++ b/src/state.c @@ -23,8 +23,14 @@ NameList *NameList_append(NameList *self, const char *name) { CHECKMEM(self); self->name = strdup(name); self->next = NULL; - }else + }else { + if(0 == strcmp(name, self->name)) { + fprintf(stderr, "%s:%d: state file \"%s\" listed multiple times.\n", + __FILE__,__LINE__,name); + exit(EXIT_FAILURE); + } self->next = NameList_append(self->next, name); + } return self; } diff --git a/src/version.h b/src/version.h index ea891a41..30e43454 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.5" +#define VERSION "1.6" #endif From 4d6374b2bfff1de71a6b63f030fd61398567f8a9 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 6 Mar 2018 12:36:59 -0700 Subject: [PATCH 057/101] Modified state.c and legofit.c. When using multiple --stateIn options, it is now possible to use state files with differing numbers of points, and to request more points (using --ptsPerDim) than are given in any single --stateIn file. If the input files don't have as many points as requested, the number of points is revised downward. --- src/legofit.c | 14 +++++++------ src/state.c | 58 +++++++++++++++++++++++++++++++++++---------------- src/state.h | 2 +- 3 files changed, 49 insertions(+), 25 deletions(-) diff --git a/src/legofit.c b/src/legofit.c index 81679446..bbfa14d5 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -88,10 +88,10 @@ parameters. To change this number, see the --ptsPerDim option. The initial swarm of points can also be specified using the `--stateIn` option. This reads a file specifying the initial state of -the swarm of points maintained by DE. The number of points and the -number of free parameters should agree with the values implied by the -.lgo file and the --ptsPerDim option. The format of this file is as -described below for the `--stateOut` option. +the swarm of points maintained by DE. 
The format of this file is as +described below for the `--stateOut` option. The number of free +parameters in each `--stateIn` file should be as specified in the .lgo +file. The number of points in the files may differ. The `--stateIn` option may be given more than once, each time with a different input file. When more than one file is given, Legofit @@ -472,8 +472,10 @@ int main(int argc, char **argv) { State *state; if(stateInNames) { // read State from file - state = State_readList(stateInNames); + state = State_readList(stateInNames, npts, + GPTree_nFree(gptree)); CHECKMEM(state); + npts = State_npoints(state); } else { // de novo State state = State_new(npts, dim); @@ -498,7 +500,7 @@ int main(int argc, char **argv) { printf("# lgo input file : %s\n", lgofname); printf("# site pat input file: %s\n", patfname); printf("# free parameters : %d\n", dim); - printf("# pts/parameter : %d\n", ptsPerDim); + printf("# points in DE swarm : %d\n", npts); if(stateInNames) { printf("# input state file(s):"); NameList_print(stateInNames, stdout); diff --git a/src/state.c b/src/state.c index 87dcad7e..d58e8a86 100644 --- a/src/state.c +++ b/src/state.c @@ -214,14 +214,14 @@ int State_print(State *self, FILE *fp) { return EIO; } -State *State_readList(NameList *list) { +State *State_readList(NameList *list, int npts, int npar) { int nstates = NameList_size(list); if(nstates==0) return NULL; State *state[nstates]; NameList *node; - int i, j, k, npts=-1, npar; + int i, j, k; for(i=0, node=list; inext) { @@ -232,22 +232,26 @@ State *State_readList(NameList *list) { fclose(fp); // Make sure dimensions are compatible. 
- if(npts < 0) { - npts = State_npoints(state[i]); - npar = State_nparameters(state[i]); - }else{ - if(npts != State_npoints(state[i]) - || npar != State_nparameters(state[i])) { - fprintf(stderr,"%s:%s:%d:" - " input state file \"%s\" has" - " incompatible dimensions.\n", - __FILE__,__func__,__LINE__, - node->name); - exit(EXIT_FAILURE); - } + if(npar != State_nparameters(state[i])) { + fprintf(stderr,"%s:%s:%d:" + " input state file \"%s\" has" + " incompatible dimensions.\n", + __FILE__,__func__,__LINE__, + node->name); + exit(EXIT_FAILURE); } } + // Make sure we're not asking for more points than exist + int total=0; + for(i=0; i < nstates; ++i) + total += State_npoints(state[i]); + if(total < npts) + npts = total; + + // Number of points to skip + int skip = total - npts; + State *self = State_new(npts, npar); CHECKMEM(self); @@ -255,14 +259,32 @@ State *State_readList(NameList *list) { // The goal is to take nearly equal numbers. div_t qr = div(npts, nstates); int npoints[nstates]; - int total=0; // for debugging + int avail[nstates]; + int got=0; + + // 1st pass: roughly equal number of points per input file for(i=0; i < nstates; ++i) { + avail[i] = State_npoints(state[i]); npoints[i] = qr.quot; if(i < qr.rem) ++npoints[i]; - total += npoints[i]; // for debugging + + if(npoints[i] > avail[i]) + npoints[i] = avail[i]; + got += npoints[i]; + } + + // additional passes: fill in missing points + while(got < npts) { + for(i=0; i < nstates; ++i) { + if(npoints[i] < avail[i]) { + ++npoints[i]; + ++got; + } + if(got == npts) + break; + } } - assert(total == npts); // Set vectors; free pointers in vector "state". 
k = 0; // index into self->s diff --git a/src/state.h b/src/state.h index e54acd72..77c3bc66 100644 --- a/src/state.h +++ b/src/state.h @@ -18,6 +18,6 @@ void State_setVector(State *self, int ndx, int dim, double x[dim]); int State_getVector(State *self, int ndx, int dim, double x[dim]); void State_setCost(State *self, int ndx, double cost); double State_getCost(State *self, int ndx); -State *State_readList(NameList *list); +State *State_readList(NameList *list, int npts, int npar); #endif From 92701202c6aef5eecc602738a68a7e79a86de268 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 6 Mar 2018 12:42:39 -0700 Subject: [PATCH 058/101] Version 1.7. When revising the number of points, legofit now prints a message to stderr. --- src/legofit.c | 6 +++++- src/version.h | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/legofit.c b/src/legofit.c index bbfa14d5..eedf0a39 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -475,7 +475,11 @@ int main(int argc, char **argv) { state = State_readList(stateInNames, npts, GPTree_nFree(gptree)); CHECKMEM(state); - npts = State_npoints(state); + if(npts != State_npoints(state)) { + fprintf(stderr, "Revising npts from %d to %d\n", + npts, State_npoints(state)); + npts = State_npoints(state); + } } else { // de novo State state = State_new(npts, dim); diff --git a/src/version.h b/src/version.h index 30e43454..184215b3 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.6" +#define VERSION "1.7" #endif From 1a2a8500ae208a7b1d0688fa3a572f93012fa598 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 6 Mar 2018 13:51:33 -0700 Subject: [PATCH 059/101] After reading in --stateIn files to create a State object, Legofit now copies one of the parameter vectors from the State object into its GPTree object. This ensures that the initial parameter values printed by Legofit represent one of the parameter vectors that is actually used. 
--- src/legofit.c | 7 +++++++ src/version.h | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/legofit.c b/src/legofit.c index eedf0a39..baa1ecc5 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -480,6 +480,13 @@ int main(int argc, char **argv) { npts, State_npoints(state)); npts = State_npoints(state); } + + // Set gptree parameters from state array, so that + // initial parameter values, as printed, will represent + // one of the vectors in the state array. + double x[dim]; + State_getVector(state, 0, dim, x); + GPTree_setParams(gptree, dim, x); } else { // de novo State state = State_new(npts, dim); diff --git a/src/version.h b/src/version.h index 184215b3..8ac2d46b 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.7" +#define VERSION "1.8" #endif From 461a896ee366be13b6f06ec9e9a3110168fba056 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 6 Mar 2018 14:49:26 -0700 Subject: [PATCH 060/101] . --- src/state.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/state.c b/src/state.c index d58e8a86..449ac4da 100644 --- a/src/state.c +++ b/src/state.c @@ -249,9 +249,6 @@ State *State_readList(NameList *list, int npts, int npar) { if(total < npts) npts = total; - // Number of points to skip - int skip = total - npts; - State *self = State_new(npts, npar); CHECKMEM(self); From 0575bea710744295f1b667316f0e252ea1cc34bd Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 6 Mar 2018 17:52:01 -0700 Subject: [PATCH 061/101] cosmetic --- test/xstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/test/xstate.c b/test/xstate.c index a459ea63..527b30d4 100644 --- a/test/xstate.c +++ b/test/xstate.c @@ -128,5 +128,6 @@ int main(int argc, char **argv) { unitTstResult("State", "OK"); unlink(fname); + unlink(fname2); return 0; } From 0ebc4eae706ad456d3890148820a1d1e430fca14 Mon Sep 17 00:00:00 2001 From: "Alan R. 
Rogers" Date: Thu, 8 Mar 2018 11:12:26 -0700 Subject: [PATCH 062/101] Changed %lf to %lg in legofit output --- src/legofit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/legofit.c b/src/legofit.c index baa1ecc5..2d794a3a 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -504,9 +504,9 @@ int main(int argc, char **argv) { nThreads = npts; printf("# DE strategy : %d\n", strategy); - printf("# F : %lf\n", F); - printf("# CR : %lf\n", CR); - printf("# tolerance : %le\n", ytol); + printf("# F : %lg\n", F); + printf("# CR : %lg\n", CR); + printf("# tolerance : %lg\n", ytol); printf("# nthreads : %d\n", nThreads); printf("# lgo input file : %s\n", lgofname); printf("# site pat input file: %s\n", patfname); From 3c2e834f7084d3eef90c1cee78a2b57f35dc27d7 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Thu, 8 Mar 2018 17:53:00 -0700 Subject: [PATCH 063/101] Changed signal handling in Legofit. Legofit now handles 3 signals: SIGINT, SIGTERM, and SIGUSR1. In response to SIGINT or SIGTERM, legofit waits until the end of the current DE generation, then exits gracefully, printing all the usual output. In response to SIGUSR1, it waits until the end of the current DE generation, then writes to stderr a summary of the current state of the optimizer. --- src/diffev.c | 17 +++++++++++------ src/diffev.h | 1 + src/legofit.c | 14 +++++++++++++- src/version.h | 2 +- 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/diffev.c b/src/diffev.c index 502b1c7b..56fccb85 100644 --- a/src/diffev.c +++ b/src/diffev.c @@ -184,6 +184,11 @@ void sighandle(int signo) { sigstat = signo; } +/// SIGTERM is translated to SIGINT +void handleSIGTERM(int signo) { + sigstat = SIGINT; +} + // Strategies. 
// We have tried to come up with a sensible // naming-convention: DE/x/y/z @@ -680,7 +685,7 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, int stage, nstages = SimSched_nStages(simSched); for(stage=0; - sigstat==0 && stage < nstages; + sigstat!=SIGINT && stage < nstages; ++stage, SimSched_next(simSched)) { // The number of simulation replicates changes with each @@ -772,8 +777,10 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, *yspread = cmax - cmin; // output - if(verbose && gen % refresh == 0) { + if((verbose && gen % refresh == 0) + || sigstat==SIGUSR1) { // display after every refresh generations + fflush(stdout); fprintf(stderr, "%d:%d cost=%1.10lg yspread=%lf\n", stage, gen, cmin, *yspread); @@ -784,10 +791,8 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, putc(',', stderr); } putc('\n', stderr); -#if 0 - printState(nPts, dim, *pold, cost, imin, stdout); -#endif - fflush(stdout); + if(sigstat==SIGUSR1) + sigstat=0; } if(sigstat==SIGINT) break; diff --git a/src/diffev.h b/src/diffev.h index f785dda3..ecf1b691 100644 --- a/src/diffev.h +++ b/src/diffev.h @@ -39,5 +39,6 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, DiffEvPar dep, gsl_rng * rng); const char *diffEvStrategyLbl(int i); void sighandle(int signo); +void handleSIGTERM(int signo); #endif diff --git a/src/legofit.c b/src/legofit.c index 2d794a3a..0422614c 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -156,6 +156,17 @@ Second, you can relax the tolerance. By default, this is 1e-4. It is reported in the legofit output. To double this value, use "-T 2e-4" or "--tol 2e-4". +Legofit handles three types of signal: SIGINT, SIGTERM, and +SIGUSR1. If legofit is running in the foreground, the first of these +signals can be generated by typing Ctrl-C. Otherwise (under linux or +osx), use `killall -SIGINT legofit`, `killall -SIGTERM legofit`, or +`killall -SIGUSR1 legofit`. 
In response to SIGINT or SIGTERM, Legofit +will wait until the end of the current generation of the diffev +algorithm and then exit gracefully, printing all the usual output. In +response to SIGUSR1, Legofit will wait until the end of the current DE +generation and then write to stderr a summary of the state of the +optimizer. + @copyright Copyright (c) 2016, 2017, 2018, Alan R. Rogers . This file is released under the Internet Systems Consortium License, which can be found in file "LICENSE". @@ -186,7 +197,6 @@ Systems Consortium License, which can be found in file "LICENSE". extern pthread_mutex_t seedLock; extern unsigned long rngseed; -extern volatile sig_atomic_t sigstat; void usage(void); void *ThreadState_new(void *notused); @@ -248,6 +258,8 @@ int main(int argc, char **argv) { // Install handler for keyboard interrupts. signal(SIGINT, sighandle); + signal(SIGTERM, handleSIGTERM); + signal(SIGUSR1, sighandle); static struct option myopts[] = { /* {char *name, int has_arg, int *flag, int val} */ diff --git a/src/version.h b/src/version.h index 8ac2d46b..f292ddaa 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.8" +#define VERSION "1.9" #endif From 19530c8ed8177d03d65819a52469f2584bd4f722 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Thu, 8 Mar 2018 18:46:02 -0700 Subject: [PATCH 064/101] Added output to legofit to indicate whether job has terminated in response to a signal. --- src/legofit.c | 4 ++++ src/version.h | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/legofit.c b/src/legofit.c index 0422614c..d85763f3 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -197,6 +197,7 @@ Systems Consortium License, which can be found in file "LICENSE". 
extern pthread_mutex_t seedLock; extern unsigned long rngseed; +extern volatile sig_atomic_t sigstat; void usage(void); void *ThreadState_new(void *notused); @@ -622,6 +623,9 @@ int main(int argc, char **argv) { status = diffev(dim, estimate, &cost, &yspread, dep, rng); + if(sigstat == SIGINT) + printf("Job terminated early in reponse to signal.\n"); + printf("DiffEv %s. cost=%0.5le spread=%0.5le\n", status == 0 ? "converged" : "FAILED", cost, yspread); #if COST==LNL_COST diff --git a/src/version.h b/src/version.h index f292ddaa..49b0b22d 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.9" +#define VERSION "1.10" #endif From 9c36ba5aba04e0cce1d1ecf2cd316119d3df67be Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sun, 18 Mar 2018 14:54:25 -0600 Subject: [PATCH 065/101] Add axt2raf and test.axt --- src/Makefile | 2 +- src/axt2raf.py | 179 +++++++++++++++++++++++++++++++++++++++++++++++++ test/test.axt | 19 ++++++ 3 files changed, 199 insertions(+), 1 deletion(-) create mode 100755 src/axt2raf.py create mode 100644 test/test.axt diff --git a/src/Makefile b/src/Makefile index f8459615..ead43dbf 100644 --- a/src/Makefile +++ b/src/Makefile @@ -8,7 +8,7 @@ prof := incl := -I/usr/local/include -I/opt/local/include targets := legosim legofit tabpat sitepat scrmpat daf raf numcores -pytargets := diverg.py bootci.py flatfile.py +pytargets := diverg.py bootci.py flatfile.py axt2raf.py tests := xzeroin xbinary CC := gcc diff --git a/src/axt2raf.py b/src/axt2raf.py new file mode 100755 index 00000000..f883f8ec --- /dev/null +++ b/src/axt2raf.py @@ -0,0 +1,179 @@ +#!/usr/bin/python +from string import maketrans +import sys +import datetime + +# Print usage message and abort +def usage(): + msg = \ + """ +usage: axt2raf.py + +where is in axt format. Writes to standard output.
+""" + sys.stdout.flush() + print >> sys.stderr, msg + exit(1) + +# For converting from negative strant to positive strand +nucleotides = "atgc" +complements = "tacg" +trtab = maketrans(nucleotides, complements) + +class SortError(Exception): + """ Exception for unsorted input """ + pass + +class Align: + def __init__(self): + self.initialized = False + + def read(self,infile): + while True: + # read until we get a non-blank line + line=f.readline() + if line == '': + self.initialized = False + return self + line = line.strip().split() + if len(line) > 0: + break + sA=f.readline() + sB=f.readline() + if sB == '': + self.initialized = False # signal end of file + return self + sA = sA.strip().lower() + sB = sB.strip().lower() + + self.alignment = int(line[0]) + self.chr=line[1] + self.start = int(line[2]) # start position + self.end = int(line[3])+1 # 1 past last position + + # If we're on the negative strand, then translate + # sA and sB by complementing each nucleotide. + # See definition of trtab above. + strand = line[7] + if strand == "-": + sA = sA.translate(trtab) + sB = sB.translate(trtab) + + self.qual = int(line[8]) + + n = len(sA) + assert n == len(sB) + gaps = 0 + for i in range(n): + if sA[i]=="-" or sB[i]=="-": + gaps += 1 + n -= gaps + self.ref = n * [None] + self.alt = n * [None] + self.raf = n * [None] + + # i indexes ref, alt, and p + # j indexes sA and sB + i = j = 0 + while i < n: + if sA[j] in "atgc" and sB[j] in "atgc": + self.ref[i] = sA[j] + if sA[j] == sB[j]: + self.alt[i] = "." 
+ self.raf[i] = 1.0 + else: + self.alt[i] = sB[j] + self.raf[i] = 0.0 + i += 1 + j += 1 + + self.initialized = True + return self + + # Print alignment + def pr(self): + print "# Alignment %d: [%d, %d)" % \ + (self.alignment, self.start, self.end) + pos = self.start + for i in range(len(self.ref)): + print "%s\t%d\t%s\t%s\t%f" % (self.chr, pos, self.ref[i],\ + self.alt[i], self.raf[i]) + pos = pos + 1 + self.initialized = False + return self + + # Define "+=" operator, which merges two alignments + def __iadd__(self, other): + if not self.initialized: + raise ValueError, "Align object not initialized" + if not other.initialized: + raise ValueError, "Align object not initialized" + if self.chr != other.chr: + raise ValueError, "Chromosomes don't match" + if self.start > other.start: + raise ValueError, "Start position of lhs exceeds rhs" + if other.start > self.end: + raise ValueError, "Alignments don't overlap" + if other.start == self.end: + self.s += other.s + self.end = other.end + if other.end < self.end: + # other is nested within self: do nothing + return self + else: + n = other.start - self.start + self.ref = self.ref[0:n] + other.ref + self.alt = self.alt[0:n] + other.alt + self.raf = self.raf[0:n] + other.raf + self.initialized = True + other.initialized = False + return self + +# Do two alignments overlap? 
+def overlap(a, b): + assert a.start <= b.start + if a.chr == b.chr and b.start < a.end: + return True + else: + return False + +if len(sys.argv) != 2: + usage() + +try: + f=open(sys.argv[1]) +except: + sys.stdout.flush() + print >> sys.stderr, "Can't open input file \"%s\"" % sys.argv[1] + exit(1) + +a = Align() +b = Align() + +a.read(f) +if a.initialized == False: + sys.stdout.flush() + print >> sys.stderr, "Can't read 1st alignment" + exit(1) + +print "#%s\t%s\t%s\t%s\t%s" % ("chr", "pos", "ref", "alt", "raf") +while True: + b.read(f) + if b.initialized == False: + break + if a.start > b.start: + sys.stdout.flush() + print >> sys.stderr, \ + "Start positions missorted: %d > %d" % (a.start, b.start) + exit(1) + if overlap(a, b): + a += b + else: + a.pr() + a, b = b, a # swap a and b + +if a.initialized: + a.pr() + +if b.initialized: + b.pr() diff --git a/test/test.axt b/test/test.axt new file mode 100644 index 00000000..772e4a44 --- /dev/null +++ b/test/test.axt @@ -0,0 +1,19 @@ +0 chr22 001 004 chrUn 401 404 + 473100 +G--ATC +gat--G + +1 chr22 002 003 chr22 402 403 + 93819 +A--A +-AG- + +2 chr22 005 007 chr22 505 507 + 93819 +CTC- +CT-A + +3 chr22 007 009 chrUn 607 609 + 473100 +G-AT +ga-a + +4 chr22 10 11 chrUn 100 101 + 473100 +GA +ga From e15736df6942b0f3043cc878f13c1da8e4962a64 Mon Sep 17 00:00:00 2001 From: "Alan R. 
Rogers" Date: Sun, 18 Mar 2018 18:13:05 -0600 Subject: [PATCH 066/101] axt2raf.py works --- src/axt2raf.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/axt2raf.py b/src/axt2raf.py index f883f8ec..f934e5c1 100755 --- a/src/axt2raf.py +++ b/src/axt2raf.py @@ -15,7 +15,7 @@ def usage(): print >> sys.stderr, msg exit(1) -# For converting from negative strant to positive strand +# For converting from negative strand to positive strand nucleotides = "atgc" complements = "tacg" trtab = maketrans(nucleotides, complements) @@ -24,7 +24,7 @@ class SortError(Exception): """ Exception for unsorted input """ pass -class Align: +class Alignment: def __init__(self): self.initialized = False @@ -63,10 +63,7 @@ def read(self,infile): n = len(sA) assert n == len(sB) - gaps = 0 - for i in range(n): - if sA[i]=="-" or sB[i]=="-": - gaps += 1 + gaps = sA.count("-") n -= gaps self.ref = n * [None] self.alt = n * [None] @@ -76,11 +73,13 @@ def read(self,infile): # j indexes sA and sB i = j = 0 while i < n: - if sA[j] in "atgc" and sB[j] in "atgc": + if sA[j] != "-": self.ref[i] = sA[j] if sA[j] == sB[j]: self.alt[i] = "." 
self.raf[i] = 1.0 + elif sB[j] == "-": + self.alt[i] = "-" # deletion in sB else: self.alt[i] = sB[j] self.raf[i] = 0.0 @@ -92,12 +91,13 @@ def read(self,infile): # Print alignment def pr(self): - print "# Alignment %d: [%d, %d)" % \ - (self.alignment, self.start, self.end) + #print "# Alignment %d: [%d, %d)" % \ + # (self.alignment, self.start, self.end) pos = self.start for i in range(len(self.ref)): - print "%s\t%d\t%s\t%s\t%f" % (self.chr, pos, self.ref[i],\ - self.alt[i], self.raf[i]) + if self.alt[i] != "-": # omit deletions + print "%s\t%d\t%s\t%s\t%f" % (self.chr, pos, self.ref[i],\ + self.alt[i], self.raf[i]) pos = pos + 1 self.initialized = False return self @@ -105,9 +105,9 @@ def pr(self): # Define "+=" operator, which merges two alignments def __iadd__(self, other): if not self.initialized: - raise ValueError, "Align object not initialized" + raise ValueError, "Alignment not initialized" if not other.initialized: - raise ValueError, "Align object not initialized" + raise ValueError, "Alignment not initialized" if self.chr != other.chr: raise ValueError, "Chromosomes don't match" if self.start > other.start: @@ -147,8 +147,8 @@ def overlap(a, b): print >> sys.stderr, "Can't open input file \"%s\"" % sys.argv[1] exit(1) -a = Align() -b = Align() +a = Alignment() +b = Alignment() a.read(f) if a.initialized == False: From 501d397563f4681ec07a8c3b6df8f2ab19043bbf Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Mon, 19 Mar 2018 12:42:29 -0600 Subject: [PATCH 067/101] Fixed bugs in axt2raf.py. The old code didn't handle missing values ("N" or "n" in input), and it didn't check that the lengths of sequences agree with each other and with the header of each alignment. 
--- src/axt2raf.py | 60 ++++++++++++++++++++++++++++++++++++++++---------- test/test.axt | 4 ++++ 2 files changed, 52 insertions(+), 12 deletions(-) diff --git a/src/axt2raf.py b/src/axt2raf.py index f934e5c1..b49f6dd5 100755 --- a/src/axt2raf.py +++ b/src/axt2raf.py @@ -45,12 +45,48 @@ def read(self,infile): return self sA = sA.strip().lower() sB = sB.strip().lower() + lenA = len(sA) + lenB = len(sB) self.alignment = int(line[0]) self.chr=line[1] - self.start = int(line[2]) # start position - self.end = int(line[3])+1 # 1 past last position - + self.start = int(line[2]) # start position, seq A + self.end = int(line[3])+1 # 1 past last position, seq A + + # lengths of seqA and seqB should match + if lenA != lenB: + sys.stdout.flush() + print >> sys.stderr, \ + "length mismatch in alignment %d" \ + % self.alignment + print >> sys.stderr, "lenA=%d but lenB=%d" \ + % (lenA, lenB) + exit(1) + + # After omitting gaps, length of seqA should match header + netA = lenA - sA.count("-") + if netA != self.end - self.start: + sys.stdout.flush() + print >> sys.stderr, \ + "non-gap length mismatch: seqA and header in alignment %d" \ + % self.alignment + print >> sys.stderr, "header=%d but netA=%d" \ + % (self.end - self.start, netA) + exit(1) + + # After omitting gaps, length of seqB should match header + netB = lenB - sB.count("-") + startB = int(line[5]) # start, seq B + endB = int(line[6])+1 # end, seq B + if netB != endB - startB: + sys.stdout.flush() + print >> sys.stderr, \ + "non-gap length mismatch: seqB and header in alignment %d" \ + % self.alignment + print >> sys.stderr, "header=%d but netB=%d" \ + % (endB - startB, netA) + exit(1) + # If we're on the negative strand, then translate # sA and sB by complementing each nucleotide. # See definition of trtab above. 
@@ -61,18 +97,15 @@ def read(self,infile): self.qual = int(line[8]) - n = len(sA) - assert n == len(sB) - gaps = sA.count("-") - n -= gaps - self.ref = n * [None] - self.alt = n * [None] - self.raf = n * [None] + # netA is length of output vectors + self.ref = netA * [None] + self.alt = netA * [None] + self.raf = netA * [None] # i indexes ref, alt, and p # j indexes sA and sB i = j = 0 - while i < n: + while i < netA: if sA[j] != "-": self.ref[i] = sA[j] if sA[j] == sB[j]: @@ -80,6 +113,8 @@ def read(self,infile): self.raf[i] = 1.0 elif sB[j] == "-": self.alt[i] = "-" # deletion in sB + elif sB[j] == "n": + self.alt[i] = sB[j] else: self.alt[i] = sB[j] self.raf[i] = 0.0 @@ -95,7 +130,8 @@ def pr(self): # (self.alignment, self.start, self.end) pos = self.start for i in range(len(self.ref)): - if self.alt[i] != "-": # omit deletions + # omit deletions and missing values + if self.alt[i] not in "-n" and self.ref[i] not in "-n": print "%s\t%d\t%s\t%s\t%f" % (self.chr, pos, self.ref[i],\ self.alt[i], self.raf[i]) pos = pos + 1 diff --git a/test/test.axt b/test/test.axt index 772e4a44..790cbe66 100644 --- a/test/test.axt +++ b/test/test.axt @@ -17,3 +17,7 @@ ga-a 4 chr22 10 11 chrUn 100 101 + 473100 GA ga + +5 chr23 13 17 chr17 101 104 + 473100 +GATna +at-tn From ae8a3fb11281cb9a82e7e7b013e5edb8b4eb7a2d Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Mon, 19 Mar 2018 14:53:18 -0600 Subject: [PATCH 068/101] version 1.11 --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.h b/src/version.h index 49b0b22d..3ef59afc 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.10" +#define VERSION "1.11" #endif From b9d1a99e62fe9307080c9e3f126d2cf0ea8f9883 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Mon, 19 Mar 2018 15:36:12 -0600 Subject: [PATCH 069/101] Wrote mergeraf.c. Compiles but is untested. 
--- src/Makefile | 10 +++-- src/mergeraf.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++ src/rafreader.h | 13 +++++- 3 files changed, 128 insertions(+), 4 deletions(-) create mode 100644 src/mergeraf.c diff --git a/src/Makefile b/src/Makefile index ead43dbf..c4f50af3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,13 +1,13 @@ destination := $(HOME)/bin global_destination := $(HOME)/group/bin -opt := -DNDEBUG -O3 -finline-functions # For full optimization -#opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging +#opt := -DNDEBUG -O3 -finline-functions # For full optimization +opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging #opt := -O3 -finline-functions # Optimization + debugging #prof := -pg -rdynamic # For profiling prof := incl := -I/usr/local/include -I/opt/local/include -targets := legosim legofit tabpat sitepat scrmpat daf raf numcores +targets := legosim legofit tabpat sitepat scrmpat daf raf numcores mergeraf pytargets := diverg.py bootci.py flatfile.py axt2raf.py tests := xzeroin xbinary @@ -66,6 +66,10 @@ SITEPAT := sitepat.o misc.o binary.o lblndx.o parkeyval.o rafreader.o \ sitepat : $(SITEPAT) $(CC) $(CFLAGS) -o $@ $(SITEPAT) $(lib) +MERGERAF := mergeraf.o rafreader.o error.o tokenizer.o misc.o +mergeraf : $(MERGERAF) + $(CC) $(CFLAGS) -o $@ $(MERGERAF) $(lib) + SCRMPAT := scrmpat.o misc.o binary.o lblndx.o parkeyval.o scrmreader.o \ tokenizer.o boot.o error.o scrmpat : $(SCRMPAT) diff --git a/src/mergeraf.c b/src/mergeraf.c new file mode 100644 index 00000000..6be887e3 --- /dev/null +++ b/src/mergeraf.c @@ -0,0 +1,109 @@ +/** +@file mergeraf.c +@page mergeraf +@brief Merge two or more raf files + +# Mergeraf: merge two or more raf files + +Sitepat reads several files in .raf format and prints a single raf file +to standard output. The output includes only those positions at which +chromosome, position, ref, and alt match in all the input +files. (Missing values in alt are allowed.) 
In the output file, the +reference allele frequency (raf) is the unweighted average of those in +the input files. + +# Usage + +@copyright Copyright (c) 2018, Alan R. Rogers +. This file is released under the Internet +Systems Consortium License, which can be found in file "LICENSE". +*/ + +#include "rafreader.h" +#include "error.h" +#include "typedefs.h" +#include "version.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAXCHR 24 // maximum number of chromosomes + +static void usage(void); + +const char *useMsg = + "\nUsage: mergeraf ...\n" + " where are input files in raf format." + " Writes to standard output.\n"; + +/// Print usage message and die. +static void usage(void) { + fputs(useMsg, stderr); + exit(1); +} + +int main(int argc, char **argv) { + int i, status, done; + char errbuff[100] = { '\0' }; + + // Each command line argument is an input file name + int n = argc - 1; // number of input files + if(n == 0) + usage(); + + RAFReader *r[n]; + + // Each argument should be the name of an input file. + for(i = 0; i < n; ++i) + r[i] = RAFReader_new(argv[i+1]); + + printf("# mergeraf version %s\n", VERSION); + + // Iterate through raf files + printf("#%s\t%s\t%s\t%s\t%s\n", "chr", "pos", "ref", "alt", "raf"); + RAFReader_clearChromosomes(n, r); + done=0; + while( !done ) { + status = RAFReader_multiNext(n, r); + switch(status) { + case 0: + break; + case EOF: + done=1; + continue; + case REF_MISMATCH: + case MULTIPLE_ALT: + case NO_ANCESTRAL_ALLELE: + continue; + default: + // something wrong. 
+ mystrerror_r(status, errbuff, sizeof errbuff); + fprintf(stderr,"%s:%d: input error (%s)\n", + __FILE__,__LINE__, errbuff); + exit(EXIT_FAILURE); + } + + // p is the average frequency of the reference allele + double p=0.0; + for(i = 0; i < n; ++i) + p += RAFReader_raf(r[i]); + p /= n; + + printf("%s\t%lu\t%s\t%s\t%0.18g\n", + RAFReader_chr(r[0]), + RAFReader_nucpos(r[0]), + RAFReader_ref(r[0]), + RAFReader_alt(r[0]), + p); + } + + for(i = 0; i < n; ++i) + RAFReader_free(r[i]); + return 0; +} diff --git a/src/rafreader.h b/src/rafreader.h index ad938d6d..e4ee4906 100644 --- a/src/rafreader.h +++ b/src/rafreader.h @@ -35,13 +35,24 @@ int RAFReader_rewind(RAFReader *self); int RAFReader_multiNext(int n, RAFReader * r[n]); static inline const char *RAFReader_chr(RAFReader *self); static inline unsigned long RAFReader_nucpos(RAFReader *self); - +static inline const char *RAFReader_ref(RAFReader *self); +static inline const char *RAFReader_alt(RAFReader *self); /// Return const pointer to label of current chromosome. static inline const char *RAFReader_chr(RAFReader *self) { return self->chr; } +/// Return const pointer to label of current reference allele +static inline const char *RAFReader_ref(RAFReader *self) { + return self->ref; +} + +/// Return const pointer to label of current alternate allele +static inline const char *RAFReader_alt(RAFReader *self) { + return self->alt; +} + /// Return position of current nucleotide site static inline unsigned long RAFReader_nucpos(RAFReader *self) { return self->nucpos; From 34c3a25cd5c22d440e2f1662756a6d2155f07a79 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Mon, 19 Mar 2018 16:01:04 -0600 Subject: [PATCH 070/101] Changed RAFReader_multiNext, which required changes in sitepat.c and mergeraf.c. Not yet tested. In the old code, RAFReader_multiNext not only advanced all readers to the next shared position, it also calculated the derived allele frequency, assuming that the final reader represents the outgroup. 
With that setup, it was impossible to use the code in mergeraf.c, which doesn't need the daf and doesn't define an outgroup. In the new code, a separate function (RAFReader_findDaf) is used to calculate the derived allele frequency, and RAFReader_multiNext doesn't do this job. Thus, mergeraf can use RAFReader_multiNext by itself, but sitepat must call both functions, one after the other. --- src/mergeraf.c | 1 - src/rafreader.c | 16 ++++++++++++++++ src/rafreader.h | 1 + src/sitepat.c | 14 ++++++++++++++ 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/mergeraf.c b/src/mergeraf.c index 6be887e3..6b78eaa2 100644 --- a/src/mergeraf.c +++ b/src/mergeraf.c @@ -79,7 +79,6 @@ int main(int argc, char **argv) { continue; case REF_MISMATCH: case MULTIPLE_ALT: - case NO_ANCESTRAL_ALLELE: continue; default: // something wrong. diff --git a/src/rafreader.c b/src/rafreader.c index 928875df..0d43881d 100644 --- a/src/rafreader.c +++ b/src/rafreader.c @@ -284,6 +284,21 @@ int RAFReader_multiNext(int n, RAFReader * r[n]) { } } while(!onSameChr || minnuc != maxnuc); + // Make sure REF and ALT are consistent across readers + if((status = RAFReader_alleleCheck(n, r))) + return status; + + return 0; +} + +/// Set derived allele frequency within each RAFReader. +/// @param[in] n number of RAFReader objects in array +/// @param[in] r array of RAFReader objects. Last one should be outgroup. +/// @return 0 on success, or one of several error codes on failure. +int RAFReader_findDaf(int n, RAFReader * r[n]) { + + int i, status; + // Make sure reference allele isn't fixed in readers, excluding // the outgroup (reader n-1). If it's fixed, then we can't call // the ancestral allele. @@ -319,6 +334,7 @@ int RAFReader_multiNext(int n, RAFReader * r[n]) { return 0; } + /// Return 0 if ref and alt alleles of all readers match; return /// REF_MISMATCH if there is a mismatch in REF alleles; return /// MULTIPLE_ALT if there is a mismatch in ALT alleles.
diff --git a/src/rafreader.h b/src/rafreader.h index e4ee4906..0f7a43ba 100644 --- a/src/rafreader.h +++ b/src/rafreader.h @@ -33,6 +33,7 @@ void RAFReader_print(RAFReader *r, FILE *fp); void RAFReader_printArray(int n, RAFReader * r[n], FILE *fp); int RAFReader_rewind(RAFReader *self); int RAFReader_multiNext(int n, RAFReader * r[n]); +int RAFReader_findDaf(int n, RAFReader * r[n]); static inline const char *RAFReader_chr(RAFReader *self); static inline unsigned long RAFReader_nucpos(RAFReader *self); static inline const char *RAFReader_ref(RAFReader *self); diff --git a/src/sitepat.c b/src/sitepat.c index 43fc8198..408a5b90 100644 --- a/src/sitepat.c +++ b/src/sitepat.c @@ -453,6 +453,20 @@ int main(int argc, char **argv) { continue; case REF_MISMATCH: case MULTIPLE_ALT: + continue; + default: + // something wrong. + mystrerror_r(status, errbuff, sizeof errbuff); + fprintf(stderr,"%s:%d: input error (%s)\n", + __FILE__,__LINE__, errbuff); + exit(EXIT_FAILURE); + } + + // Set derived allele frequence in each reader + status = RAFReader_findDaf(n, r); + switch(status) { + case 0: + break; case NO_ANCESTRAL_ALLELE: continue; default: From af611ceccd7ea157241b0759dfacd3eb31838418 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 20 Mar 2018 09:09:30 -0600 Subject: [PATCH 071/101] Renamed "mergeraf" as "joinraf". 
--- src/Makefile | 8 ++++---- src/{mergeraf.c => joinraf.c} | 10 +++++----- test/xrafreader.c | 4 +++- 3 files changed, 12 insertions(+), 10 deletions(-) rename src/{mergeraf.c => joinraf.c} (94%) diff --git a/src/Makefile b/src/Makefile index c4f50af3..1a6aba3f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -7,7 +7,7 @@ opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging prof := incl := -I/usr/local/include -I/opt/local/include -targets := legosim legofit tabpat sitepat scrmpat daf raf numcores mergeraf +targets := legosim legofit tabpat sitepat scrmpat daf raf numcores joinraf pytargets := diverg.py bootci.py flatfile.py axt2raf.py tests := xzeroin xbinary @@ -66,9 +66,9 @@ SITEPAT := sitepat.o misc.o binary.o lblndx.o parkeyval.o rafreader.o \ sitepat : $(SITEPAT) $(CC) $(CFLAGS) -o $@ $(SITEPAT) $(lib) -MERGERAF := mergeraf.o rafreader.o error.o tokenizer.o misc.o -mergeraf : $(MERGERAF) - $(CC) $(CFLAGS) -o $@ $(MERGERAF) $(lib) +JOINRAF := joinraf.o rafreader.o error.o tokenizer.o misc.o +joinraf : $(JOINRAF) + $(CC) $(CFLAGS) -o $@ $(JOINRAF) $(lib) SCRMPAT := scrmpat.o misc.o binary.o lblndx.o parkeyval.o scrmreader.o \ tokenizer.o boot.o error.o diff --git a/src/mergeraf.c b/src/joinraf.c similarity index 94% rename from src/mergeraf.c rename to src/joinraf.c index 6b78eaa2..5dfa0b3e 100644 --- a/src/mergeraf.c +++ b/src/joinraf.c @@ -1,9 +1,9 @@ /** -@file mergeraf.c -@page mergeraf +@file joinraf.c +@page joinraf @brief Merge two or more raf files -# Mergeraf: merge two or more raf files +# Joinraf: merge two or more raf files Sitepat reads several files in .raf format and prints a single raf file to standard output. The output includes only those positions at which @@ -38,7 +38,7 @@ Systems Consortium License, which can be found in file "LICENSE". static void usage(void); const char *useMsg = - "\nUsage: mergeraf ...\n" + "\nUsage: joinraf ...\n" " where are input files in raf format." 
" Writes to standard output.\n"; @@ -63,7 +63,7 @@ int main(int argc, char **argv) { for(i = 0; i < n; ++i) r[i] = RAFReader_new(argv[i+1]); - printf("# mergeraf version %s\n", VERSION); + printf("# joinraf version %s\n", VERSION); // Iterate through raf files printf("#%s\t%s\t%s\t%s\t%s\n", "chr", "pos", "ref", "alt", "raf"); diff --git a/test/xrafreader.c b/test/xrafreader.c index b6229aa4..40d87e55 100644 --- a/test/xrafreader.c +++ b/test/xrafreader.c @@ -132,6 +132,8 @@ int main(int argc, char **argv) { i=0; do{ status = RAFReader_multiNext(3, r); + if(status==0) + status = RAFReader_findDaf(3, r); if(i==0) { assert(status==0); assert(0 == strcmp("1",RAFReader_chr(r[0]))); @@ -159,7 +161,7 @@ int main(int argc, char **argv) { } ++i; }while(status != EOF); - + for(i = 0; i < 3; ++i) { RAFReader_free(r[i]); remove(tst[i]); From 4a25d94a90d0d020eb4d229146a2815115a03fd8 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 20 Mar 2018 13:28:21 -0600 Subject: [PATCH 072/101] axt2raf.py now handles comments, which are lines beginning with "#". --- src/axt2raf.py | 2 ++ test/test.axt | 1 + 2 files changed, 3 insertions(+) diff --git a/src/axt2raf.py b/src/axt2raf.py index b49f6dd5..19d6eb13 100755 --- a/src/axt2raf.py +++ b/src/axt2raf.py @@ -37,6 +37,8 @@ def read(self,infile): return self line = line.strip().split() if len(line) > 0: + if line[0] == "#": + continue break sA=f.readline() sB=f.readline() diff --git a/test/test.axt b/test/test.axt index 790cbe66..2fe46682 100644 --- a/test/test.axt +++ b/test/test.axt @@ -1,3 +1,4 @@ +# Header: to be ignored 0 chr22 001 004 chrUn 401 404 + 473100 G--ATC gat--G From fd93367c75ba86a8cfe4b1f94bc48fbbbc121cbe Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 20 Mar 2018 13:31:16 -0600 Subject: [PATCH 073/101] Optimization on. 
--- src/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Makefile b/src/Makefile index 1a6aba3f..57a62efc 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,7 @@ destination := $(HOME)/bin global_destination := $(HOME)/group/bin -#opt := -DNDEBUG -O3 -finline-functions # For full optimization -opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging +opt := -DNDEBUG -O3 -finline-functions # For full optimization +#opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging #opt := -O3 -finline-functions # Optimization + debugging #prof := -pg -rdynamic # For profiling prof := From 17ea573c93082b63ee49b71593523e8eff37d4ed Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 20 Mar 2018 13:32:07 -0600 Subject: [PATCH 074/101] Version 1.12 --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.h b/src/version.h index 3ef59afc..c73f6451 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.11" +#define VERSION "1.12" #endif From a5e1e9f567a4661acb54c216d25bbb7c928a967c Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 20 Mar 2018 13:32:45 -0600 Subject: [PATCH 075/101] . --- src/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Makefile b/src/Makefile index c4f50af3..749db83b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,7 @@ destination := $(HOME)/bin global_destination := $(HOME)/group/bin -#opt := -DNDEBUG -O3 -finline-functions # For full optimization -opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging +opt := -DNDEBUG -O3 -finline-functions # For full optimization +#opt := -O0 -fno-inline-functions -rdynamic -DDEBUG # For debugging #opt := -O3 -finline-functions # Optimization + debugging #prof := -pg -rdynamic # For profiling prof := From c65c5defbe7c4a779269fa4a52e449f41a0fd283 Mon Sep 17 00:00:00 2001 From: "Alan R. 
Rogers" Date: Tue, 20 Mar 2018 14:00:06 -0600 Subject: [PATCH 076/101] Modified test/test.axt --- test/test.axt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test.axt b/test/test.axt index 2fe46682..1dd19cee 100644 --- a/test/test.axt +++ b/test/test.axt @@ -4,8 +4,8 @@ G--ATC gat--G 1 chr22 002 003 chr22 402 403 + 93819 -A--A --AG- +A-A +-AT 2 chr22 005 007 chr22 505 507 + 93819 CTC- From 385e40887b314293f79677fc8cac393d5cba9f3a Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 20 Mar 2018 14:01:43 -0600 Subject: [PATCH 077/101] . --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 265c0f17..040d9af1 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ xrafreader xmisc xscrmreader xstrint +joinraf legosim numcores legofit From 6642bc560da27b93894a27fd6622f85c0079eeb4 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 20 Mar 2018 14:10:29 -0600 Subject: [PATCH 078/101] . --- src/joinraf.c | 3 +-- src/version.h | 2 +- test/xrafreader.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/joinraf.c b/src/joinraf.c index 5dfa0b3e..ee03f305 100644 --- a/src/joinraf.c +++ b/src/joinraf.c @@ -5,7 +5,7 @@ # Joinraf: merge two or more raf files -Sitepat reads several files in .raf format and prints a single raf file +Joinraf reads several files in .raf format and prints a single raf file to standard output. The output includes only those positions at which chromosome, position, ref, and alt match in all the input files. (Missing values in alt are allowed.) 
In the output file, the @@ -67,7 +67,6 @@ int main(int argc, char **argv) { // Iterate through raf files printf("#%s\t%s\t%s\t%s\t%s\n", "chr", "pos", "ref", "alt", "raf"); - RAFReader_clearChromosomes(n, r); done=0; while( !done ) { status = RAFReader_multiNext(n, r); diff --git a/src/version.h b/src/version.h index c73f6451..8f45564a 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.12" +#define VERSION "1.13" #endif diff --git a/test/xrafreader.c b/test/xrafreader.c index 40d87e55..fe70d5d5 100644 --- a/test/xrafreader.c +++ b/test/xrafreader.c @@ -164,7 +164,7 @@ int main(int argc, char **argv) { for(i = 0; i < 3; ++i) { RAFReader_free(r[i]); - remove(tst[i]); + //remove(tst[i]); } unitTstResult("RAFReader", "OK"); From fee25dae17bd1a4c3e398a4039eccf54c2d61623 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 20 Mar 2018 23:03:28 -0600 Subject: [PATCH 079/101] Fixed input code in axt2raf.py. --- src/axt2raf.py | 14 +++++++------- src/version.h | 2 +- test/test.axt | 3 +++ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/axt2raf.py b/src/axt2raf.py index 19d6eb13..2b35a82f 100755 --- a/src/axt2raf.py +++ b/src/axt2raf.py @@ -32,14 +32,14 @@ def read(self,infile): while True: # read until we get a non-blank line line=f.readline() - if line == '': + if line == '': # end of file self.initialized = False return self - line = line.strip().split() - if len(line) > 0: - if line[0] == "#": - continue - break + line = line.strip() + if len(line)==0 or line[0] == "#": # blank line or comment + continue + break # non-blank, non-comment + line = line.split() sA=f.readline() sB=f.readline() if sB == '': @@ -88,7 +88,7 @@ def read(self,infile): print >> sys.stderr, "header=%d but netB=%d" \ % (endB - startB, netA) exit(1) - + # If we're on the negative strand, then translate # sA and sB by complementing each nucleotide. # See definition of trtab above. 
diff --git a/src/version.h b/src/version.h index 8f45564a..a7e5b541 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.13" +#define VERSION "1.14" #endif diff --git a/test/test.axt b/test/test.axt index 1dd19cee..5f520e94 100644 --- a/test/test.axt +++ b/test/test.axt @@ -1,4 +1,7 @@ # Header: to be ignored +# Header: to be ignored +# Header: to be ignored +# Header: to be ignored 0 chr22 001 004 chrUn 401 404 + 473100 G--ATC gat--G From b92b804d7579ddbaee750698448bb13c37fdb8f4 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sat, 24 Mar 2018 11:52:18 -0600 Subject: [PATCH 080/101] Version 1.15. Changed default parameters of differential evolution within legofit.c. These are now "strategy=2", "F=0.3", and "CR=0.8". --- src/legofit.c | 9 ++++----- src/version.h | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/legofit.c b/src/legofit.c index d85763f3..20e4c7b2 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -64,9 +64,8 @@ are processors on the machine---usually the number of hypercores. The DE algorithm can be tuned via command line arguments `-F`, `-x`, `-s`, and `-p`. Details regarding these choices can be found in "Differential evolution: a practical approach to global optimization", -by Price, Storn, and Lampinen. I don't yet know what is best, but I'm -currently using `-s 2` and `-s 4`, with default values of the other -options. +by Price, Storn, and Lampinen. We've had good results with `-s 2`, `-F +0.3`, and `-x 0.8`, so these became the defaults as of version 1.15. During the first few hundred generations of the DE algorithm, the swarm of points adapts to the objective function. 
During this initial @@ -302,14 +301,14 @@ int main(int argc, char **argv) { FILE *stateOut = NULL; // DiffEv parameters - double F = 0.9; + double F = 0.3; double CR = 0.8; #if COST!=KL_COST && COST!=LNL_COST double u = 0.0; // mutation rate per site per generation long nnuc = 0; // number of nucleotides per haploid genome #endif double ytol = 1e-4; // stop when yspread <= ytol - int strategy = 1; + int strategy = 2; int ptsPerDim = 10; int verbose = 0; SimSched *simSched = SimSched_new(); diff --git a/src/version.h b/src/version.h index a7e5b541..02def427 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.14" +#define VERSION "1.15" #endif From a9f52ed4532311896305be322001baa24dc54b74 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 27 Mar 2018 01:46:07 -0600 Subject: [PATCH 081/101] Added two arguments to axt2raf.py: -minlen : set min alignment length to base pairs -minqual : set min alignment quality to --- src/axt2raf.py | 63 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 14 deletions(-) diff --git a/src/axt2raf.py b/src/axt2raf.py index 2b35a82f..02501f6a 100755 --- a/src/axt2raf.py +++ b/src/axt2raf.py @@ -3,13 +3,26 @@ import sys import datetime +# External variables +minlen = 1 +minqual = 0 + # Print usage message and abort def usage(): msg = \ """ -usage: axt2raf.py +axt2raf: convert axt format to raf format. + +usage: axt2raf.py [options] [inputfile] +where options may include: + + -minlen : set min alignment length to base pairs + -minqual : set min alignment quality to + +Program reads from "inputfile" if provided, or otherwise from +standard input. Input should be in axt format. -where is in axt format. Writes to standard output. +Writes to standard output. 
""" sys.stdout.flush() print >> sys.stderr, msg @@ -66,8 +79,9 @@ def read(self,infile): exit(1) # After omitting gaps, length of seqA should match header + self.length = self.end - self.start netA = lenA - sA.count("-") - if netA != self.end - self.start: + if netA != self.length: sys.stdout.flush() print >> sys.stderr, \ "non-gap length mismatch: seqA and header in alignment %d" \ @@ -128,8 +142,13 @@ def read(self,infile): # Print alignment def pr(self): - #print "# Alignment %d: [%d, %d)" % \ - # (self.alignment, self.start, self.end) + #print "# Alignment %d: [%d, %d) len=%d qual=%d" % \ + # (self.alignment, self.start, self.end, self.length, self.qual) + + # Filter + if self.length < minlen or self.qual < minqual: + return self + pos = self.start for i in range(len(self.ref)): # omit deletions and missing values @@ -175,15 +194,31 @@ def overlap(a, b): else: return False -if len(sys.argv) != 2: - usage() - -try: - f=open(sys.argv[1]) -except: - sys.stdout.flush() - print >> sys.stderr, "Can't open input file \"%s\"" % sys.argv[1] - exit(1) +f = sys.stdin +i = 1 +while i < len(sys.argv): + if sys.argv[i] == "-minlen": + i += 1 + if i == len(sys.argv): + usage(); + minlen = int(sys.argv[i]) + elif sys.argv[i] == "-minqual": + i += 1 + if i == len(sys.argv): + usage(); + minqual = int(sys.argv[i]) + elif sys.argv[i][0] == "-": + usage() + else: + if f != sys.stdin: + usage() + try: + f=open(sys.argv[i]) + except: + sys.stdout.flush() + print >> sys.stderr, "Can't open input file \"%s\"" % sys.argv[i] + exit(1) + i += 1 a = Alignment() b = Alignment() From 11a3802a594c5cbdd5e78d24cf11168d8155ee6d Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 27 Mar 2018 01:50:02 -0600 Subject: [PATCH 082/101] Modified axt2raf.py. The "pr" method now sets "initialized=False" when an alignment fails filtering criteria. 
--- src/axt2raf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/axt2raf.py b/src/axt2raf.py index 02501f6a..a59b7d63 100755 --- a/src/axt2raf.py +++ b/src/axt2raf.py @@ -147,6 +147,7 @@ def pr(self): # Filter if self.length < minlen or self.qual < minqual: + self.initialized = False return self pos = self.start From c15b8f6bb8bd91ac05aa666e2b8d54e9f84610db Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 27 Mar 2018 12:09:15 -0600 Subject: [PATCH 083/101] Fixed bug in axt2raf.py. In the old code, "+=" would combine alignments even if one or both of them failed quality control. The merged alignment, being longer, could then pass quality control. Now QC is done when the alignment is initially read, and "+=" won't merge alignments unless both of them pass QC. --- src/axt2raf.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/axt2raf.py b/src/axt2raf.py index a59b7d63..440ad00e 100755 --- a/src/axt2raf.py +++ b/src/axt2raf.py @@ -57,6 +57,7 @@ def read(self,infile): sB=f.readline() if sB == '': self.initialized = False # signal end of file + self.reject = True return self sA = sA.strip().lower() sB = sB.strip().lower() @@ -113,6 +114,12 @@ def read(self,infile): self.qual = int(line[8]) + # Filter + if self.length < minlen or self.qual < minqual: + self.reject = True + else: + self.reject = False + # netA is length of output vectors self.ref = netA * [None] self.alt = netA * [None] @@ -146,7 +153,7 @@ def pr(self): # (self.alignment, self.start, self.end, self.length, self.qual) # Filter - if self.length < minlen or self.qual < minqual: + if self.reject: self.initialized = False return self @@ -178,6 +185,10 @@ def __iadd__(self, other): if other.end < self.end: # other is nested within self: do nothing return self + if other.reject: + return self + if self.reject: + return other else: n = other.start - self.start self.ref = self.ref[0:n] + other.ref From 1204957de4472797511306707a06647e81a411c7 Mon Sep 17 00:00:00 
2001 From: "Alan R. Rogers" Date: Fri, 30 Mar 2018 22:21:18 -0600 Subject: [PATCH 084/101] Fixed bug in sitepat. Previous code failed to call RAFReader_findDaf, so derived allele frequency was undefined. --- src/sitepat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sitepat.c b/src/sitepat.c index 408a5b90..59cf0f8e 100644 --- a/src/sitepat.c +++ b/src/sitepat.c @@ -574,6 +574,7 @@ int main(int argc, char **argv) { assert(snpndx < nsnp[chrndx]); #endif } + RAFReader_findDaf(n, r); // p and q are frequencies of derived and ancestral alleles double p[m], q[m]; for(j = 0; j < m; ++j) { From 7c2c7a5d3e737acd4f1ac1443abcc6b744c20607 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Fri, 30 Mar 2018 22:23:03 -0600 Subject: [PATCH 085/101] Version 1.17 --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.h b/src/version.h index 02def427..2c107097 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.15" +#define VERSION "1.17" #endif From 3f1a13139bc02605abff39d3f194b61023c9d8e5 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Fri, 30 Mar 2018 22:38:36 -0600 Subject: [PATCH 086/101] Still fixing bugs in sitepat.c. DAF isn't being set properly. --- src/sitepat.c | 17 ++++++++++++++++- src/version.h | 2 +- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/sitepat.c b/src/sitepat.c index 59cf0f8e..9f2ff451 100644 --- a/src/sitepat.c +++ b/src/sitepat.c @@ -554,6 +554,21 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } + // Set derived allele frequence in each reader + status = RAFReader_findDaf(n, r); + switch(status) { + case 0: + break; + case NO_ANCESTRAL_ALLELE: + continue; + default: + // something wrong.
+ mystrerror_r(status, errbuff, sizeof errbuff); + fprintf(stderr,"%s:%d: input error (%s)\n", + __FILE__,__LINE__, errbuff); + exit(EXIT_FAILURE); + } + if(bootreps > 0) { // chrndx is index of current chromosome errno = 0; @@ -574,7 +589,7 @@ int main(int argc, char **argv) { assert(snpndx < nsnp[chrndx]); #endif } - RAFReader_findDaf(n, r); + // p and q are frequencies of derived and ancestral alleles double p[m], q[m]; for(j = 0; j < m; ++j) { diff --git a/src/version.h b/src/version.h index 2c107097..7e4783ae 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.17" +#define VERSION "1.18" #endif From 997a8ff5086185e6ade5fd32a07f0dbd1d622491 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sat, 31 Mar 2018 00:10:41 -0600 Subject: [PATCH 087/101] Version 1.19. Less indentation before population labels in output of sitepat. --- src/sitepat.c | 2 +- src/version.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sitepat.c b/src/sitepat.c index 9f2ff451..d20692bb 100644 --- a/src/sitepat.c +++ b/src/sitepat.c @@ -387,7 +387,7 @@ int main(int argc, char **argv) { printf("# sitepat version %s\n", VERSION); printf("# Population labels:\n"); for(i = 0; i < n; ++i) - printf("# %8s=%s\n", poplbl[i], fname[i]); + printf("# %s=%s\n", poplbl[i], fname[i]); // make sure labels are all different for(i = 1; i < n; ++i) diff --git a/src/version.h b/src/version.h index 7e4783ae..a439a4e2 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.18" +#define VERSION "1.19" #endif From e66f179cb2124e13e0eff713941a55bada024d5e Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sat, 31 Mar 2018 00:32:40 -0600 Subject: [PATCH 088/101] Version 1.20. Removed extra space before population labels in tabpat output.
--- src/tabpat.c | 2 +- src/version.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tabpat.c b/src/tabpat.c index 971a05f3..81c8760b 100644 --- a/src/tabpat.c +++ b/src/tabpat.c @@ -376,7 +376,7 @@ int main(int argc, char **argv) { printf("# tabpat version %s\n", VERSION); printf("# Population labels:\n"); for(i = 0; i < n; ++i) - printf("# %4s=%s\n", poplbl[i], fname[i]); + printf("# %s=%s\n", poplbl[i], fname[i]); // make sure labels are all different for(i = 1; i < n; ++i) diff --git a/src/version.h b/src/version.h index a439a4e2..359b64fe 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.19" +#define VERSION "1.20" #endif From bf6a0fda55558decb3a784aa567c6267bc45bb03 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sat, 31 Mar 2018 00:37:51 -0600 Subject: [PATCH 089/101] Version 1.21. Simplified logic in sitepat.c. --- src/sitepat.c | 33 ++++----------------------------- src/version.h | 2 +- 2 files changed, 5 insertions(+), 30 deletions(-) diff --git a/src/sitepat.c b/src/sitepat.c index d20692bb..1f221b55 100644 --- a/src/sitepat.c +++ b/src/sitepat.c @@ -445,6 +445,8 @@ int main(int argc, char **argv) { done=0; while(!done) { status = RAFReader_multiNext(n, r); + if(status==0) + status = RAFReader_findDaf(n, r); switch(status) { case 0: break; @@ -453,20 +455,6 @@ int main(int argc, char **argv) { continue; case REF_MISMATCH: case MULTIPLE_ALT: - continue; - default: - // something wrong. 
- mystrerror_r(status, errbuff, sizeof errbuff); - fprintf(stderr,"%s:%d: input error (%s)\n", - __FILE__,__LINE__, errbuff); - exit(EXIT_FAILURE); - } - - // Set derived allele frequence in each reader - status = RAFReader_findDaf(n, r); - switch(status) { - case 0: - break; case NO_ANCESTRAL_ALLELE: continue; default: @@ -519,6 +507,8 @@ int main(int argc, char **argv) { done=0; while( !done ) { status = RAFReader_multiNext(n, r); + if(status==0) + status = RAFReader_findDaf(n, r); switch(status) { case 0: ++nsites; @@ -554,21 +544,6 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } - // Set derived allele frequence in each reader - status = RAFReader_findDaf(n, r); - switch(status) { - case 0: - break; - case NO_ANCESTRAL_ALLELE: - continue; - default: - // something wrong. - mystrerror_r(status, errbuff, sizeof errbuff); - fprintf(stderr,"%s:%d: input error (%s)\n", - __FILE__,__LINE__, errbuff); - exit(EXIT_FAILURE); - } - if(bootreps > 0) { // chrndx is index of current chromosome errno = 0; diff --git a/src/version.h b/src/version.h index 359b64fe..3bce845d 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.20" +#define VERSION "1.21" #endif From fb6b2d1da3494fabe89eb0ea73d32a95405ceb88 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sat, 31 Mar 2018 11:15:03 -0600 Subject: [PATCH 090/101] Added negative strand alignment to test.axt --- test/test.axt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test.axt b/test/test.axt index 5f520e94..51763481 100644 --- a/test/test.axt +++ b/test/test.axt @@ -22,6 +22,6 @@ ga-a GA ga -5 chr23 13 17 chr17 101 104 + 473100 +5 chr23 13 17 chr17 101 104 - 473100 GATna at-tn From 549f6f8583df48905edab41982865e1180a796b4 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sat, 31 Mar 2018 12:11:56 -0600 Subject: [PATCH 091/101] Change to axt2raf.py. 
For alignments on the "-" strand, the old code translated both sequences by complementing each nucleotide. The new code complements only the second sequence. --- src/axt2raf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/axt2raf.py b/src/axt2raf.py index 440ad00e..37059cea 100755 --- a/src/axt2raf.py +++ b/src/axt2raf.py @@ -105,11 +105,10 @@ def read(self,infile): exit(1) # If we're on the negative strand, then translate - # sA and sB by complementing each nucleotide. + # sB (but not sA) by complementing each nucleotide. # See definition of trtab above. strand = line[7] if strand == "-": - sA = sA.translate(trtab) sB = sB.translate(trtab) self.qual = int(line[8]) From 4931efd68835dc8952eda1d9a2d78efbc36bdae9 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sat, 31 Mar 2018 17:55:24 -0600 Subject: [PATCH 092/101] Version 1.22. This commit does nothing but increment the version number, which I forgot to change in the previous commit. --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.h b/src/version.h index 3bce845d..ee20a578 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.21" +#define VERSION "1.22" #endif From 33d3cc15645584e87461ca7bfb108b0ef11deeb1 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Wed, 4 Apr 2018 06:17:25 -0600 Subject: [PATCH 093/101] Version 1.23. On exit, legofit no longer prints "converged" or "failed". Instead, there are four options: "reached_goal", "finished_iterations", "was_interrupted", and "stopped_for_an_unknown_reason". I did this because the user should not assume that legofit has failed just because it finished all requested iterations. To make this change, I defined a new enumeration type called "DEStatus", which diffev now returns.
--- src/diffev.c | 22 ++++++++++++++-------- src/diffev.h | 13 +++++++++++-- src/legofit.c | 22 +++++++++++++++++----- src/version.h | 2 +- 4 files changed, 43 insertions(+), 16 deletions(-) diff --git a/src/diffev.c b/src/diffev.c index 56fccb85..f4d2ce07 100644 --- a/src/diffev.c +++ b/src/diffev.c @@ -622,8 +622,8 @@ void *getStratFun(int strategy) { } /// The diffev optimizer. -int diffev(int dim, double estimate[dim], double *loCost, double *yspread, - DiffEvPar dep, gsl_rng * rng) { +DEStatus diffev(int dim, double estimate[dim], double *loCost, double *yspread, + DiffEvPar dep, gsl_rng * rng) { int i, j; // counting variables int imin = INT_MAX; // index to member with lowest energy @@ -638,6 +638,7 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, int nPts = dep.dim * dep.ptsPerDim; int status; + DEStatus destat = Running; int ndx[nPts]; for(i = 0; i < nPts; ++i) ndx[i] = i; @@ -794,8 +795,10 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, if(sigstat==SIGUSR1) sigstat=0; } - if(sigstat==SIGINT) + if(sigstat==SIGINT) { + destat = Interrupted; break; + } if(stage==nstages-1 && *yspread <= dep.ytol) break; } @@ -803,13 +806,16 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, JobQueue_noMoreJobs(jq); if(*yspread <= dep.ytol) { - status = 0; + destat = ReachedGoal; + if(verbose) + fputs("ReachedGoal\n", stderr); + } else if(destat==Interrupted) { if(verbose) - fputs("Converged\n", stdout); + fputs("Interrupted\n", stderr); } else { - status = 1; + destat = FinishedIterations; if(verbose) - fputs("No convergence\n", stdout); + fputs("FinishedIterations\n", stderr); } // Return estimates @@ -830,7 +836,7 @@ int diffev(int dim, double estimate[dim], double *loCost, double *yspread, } JobQueue_free(jq); - return status; + return destat; } /// Choose at random k distinct integers from array of n, placing diff --git a/src/diffev.h b/src/diffev.h index ecf1b691..d2749f42 100644 --- 
a/src/diffev.h +++ b/src/diffev.h @@ -12,6 +12,15 @@ typedef struct TaskArg TaskArg; typedef struct DiffEv DiffEv; typedef struct DiffEvPar DiffEvPar; +typedef enum DEStatus DEStatus; + +// DE sets its status to Running on entry. If status still equals +// Running on return, something is wrong. +// On return, status should equal one of the following: +// ReachedGoal: DE stopped because yspread <= ytol +// FinishedIterations: DE stopped after completing all iterations +// Interrupted: DE stopped on a signal (SIGINT or SIGTERM) +enum DEStatus {ReachedGoal, FinishedIterations, Interrupted, Running}; struct DiffEvPar { int dim, ptsPerDim, refresh, strategy, nthreads, verbose; @@ -35,8 +44,8 @@ struct DiffEvPar { State *state; }; -int diffev(int dim, double estimate[dim], double *loCost, - double *yspread, DiffEvPar dep, gsl_rng * rng); +DEStatus diffev(int dim, double estimate[dim], double *loCost, + double *yspread, DiffEvPar dep, gsl_rng * rng); const char *diffEvStrategyLbl(int i); void sighandle(int signo); void handleSIGTERM(int signo); diff --git a/src/legofit.c b/src/legofit.c index 20e4c7b2..33b69037 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -620,13 +620,25 @@ int main(int argc, char **argv) { // possible while diffev is running. fflush(stdout); - status = diffev(dim, estimate, &cost, &yspread, dep, rng); - - if(sigstat == SIGINT) - printf("Job terminated early in reponse to signal.\n"); + DEStatus destat = diffev(dim, estimate, &cost, &yspread, dep, rng); + + const char *whyDEstopped; + switch(destat) { + case ReachedGoal: + whyDEstopped = "reached_goal"; + break; + case FinishedIterations: + whyDEstopped = "finished_iterations"; + break; + case Interrupted: + whyDEstopped = "was_interrupted"; + break; + default: + whyDEstopped = "stopped_for_an_unknown_reason"; + } printf("DiffEv %s. cost=%0.5le spread=%0.5le\n", - status == 0 ? 
"converged" : "FAILED", cost, yspread); + whyDEstopped, cost, yspread); #if COST==LNL_COST printf(" relspread=%e", yspread / cost); #endif diff --git a/src/version.h b/src/version.h index ee20a578..c0a8c22c 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.22" +#define VERSION "1.23" #endif From 74cccaf7dadb3d3bedc94aa4e576011c78cf6ca5 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Wed, 4 Apr 2018 06:41:04 -0600 Subject: [PATCH 094/101] . --- src/legofit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/legofit.c b/src/legofit.c index 33b69037..56a1b449 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -117,8 +117,8 @@ singletons, by using the `-1` option of @\ref tabpat "tabpat". previous code, differential evolution (DE) iterations stopped when neither the best objective function value nor the spread of these values had changed in a fixed number of iterations. That criterion was -used in our recent paper, "Early history of Neanderthals and -Denisovans", which was just published in PNAS. +used in our PNAS paper, "Early history of Neanderthals and +Denisovans". I began to notice convergence problems with models larger than those used in the August 2017 PNAS paper. All bootstrap replicates would From ae06f9e0c8490e05ee77466d9e9fd2f98c4fb54f Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Wed, 4 Apr 2018 17:05:51 -0600 Subject: [PATCH 095/101] add raf2daf.c --- src/raf2daf.c | 683 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 683 insertions(+) create mode 100644 src/raf2daf.c diff --git a/src/raf2daf.c b/src/raf2daf.c new file mode 100644 index 00000000..07138b78 --- /dev/null +++ b/src/raf2daf.c @@ -0,0 +1,683 @@ +/** +@file raf2daf.c +@page raf2daf +@brief Convert raf files to daf files + +# raf2daf: converts raf files to daf files + +Raf2daf reads data in .raf format and writes the corresponding daf files. + +# Usage + + Usage: raf2daf [options] = = ... 
+ where and are arbitrary labels, and and are input + files in raf format. Writes to standard output. Labels may not include + the character ":". Maximum number of input files: 32. + + Options may include: + -f or --bootfile + Bootstrap output file basename. Def: raf2daf.boot. + -r or --bootreps + # of bootstrap replicates. Def: 0 + -b or --blocksize + # of SNPs per block in moving-blocks bootstrap. Def: 0. + -1 or --singletons + Use singleton site patterns + -m or --logMismatch + log AA/DA mismatches to raf2daf.log + -A or --logAA + log sites with uncallable ancestral alleles + -a or --logAll + log all sites to raf2daf.log + -h or --help + Print this message + +# Example + +Before running `raf2daf`, use @ref raf "raf" to convert the input data +into raf format. Let us assume you have done this, and that directory +~/raf contains a separate raf file for each population. We want to +compare 4 populations, whose .raf files are `yri.raf`, `ceu.raf`, +`altai.raf`, and `denisova.raf`. The following command will do this, +putting the results into `obs.txt`. + + raf2daf x=~/raf/yri.raf \ + y=~/raf/ceu.raf \ + n=~/raf/altai.raf \ + d=~/raf/denisova.raf > obs.txt + +Here, "x", "y", "n", and "d" are labels that will be used to identify +site patterns in the output. For example, site pattern "x:y" refers to +the pattern in which the derived allele is present haploid samples +from "x" and "y" but not on those from other populations. The order of +the command-line arguments determines the order in which labels are +sorted on output. Given the command line above, we would get a site +pattern labeled "x:y:d" rather than, say, "y:x:d". + +The output looks like this: + + # Population labels: + # x = /home/rogers/raf/yri.raf + # y = /home/rogers/raf/ceu.raf + # n = /home/rogers/raf/altai.raf + # d = /home/rogers/raf/denisova.raf + # Excluding singleton site patterns. 
+ # Number of site patterns: 10 + # Tabulated 12327755 SNPs + # SitePat E[count] + x:y 340952.4592501 + x:n 46874.1307236 + x:d 46034.4670204 + y:n 55137.4236715 + y:d 43535.5248078 + n:d 231953.3372578 + x:y:n 91646.1277991 + x:y:d 88476.9619569 + x:n:d 96676.3877423 + y:n:d 100311.4411513 + +The left column lists the site patterns that occur in the data. The +right column gives the expected count of each site pattern. These are +not integers, because they represent averages over all possible +subsamples consisting of a single haploid genome from each +population. + +In the raf files used as input, chromosomes should appear in lexical +order. Within each chromosome, nucleotides should appear in numerical +order. There should be no duplicate (chromosome, position) +pairs. Otherwise, the program aborts with an error. + +To generate a bootstrap, use the `--bootreps` option: + + sitepat --bootreps 50 \ + x=~/raf/yri.raf \ + y=~/raf/ceu.raf \ + n=~/raf/altai.raf \ + d=~/raf/denisova.raf > obs.txt + +This will generate not only the primary output file, `obs.txt`, but also +50 additional files, each representing a single bootstrap +replicate. The primary output file now has a bootstrap confidence +interval: + + # Population labels: + # x = /home/rogers/raf/yri.raf + # y = /home/rogers/raf/ceu.raf + # n = /home/rogers/raf/altai.raf + # d = /home/rogers/raf/denisova.raf + # Excluding singleton site patterns. 
+ # Number of site patterns: 10 + # Tabulated 12327755 SNPs + # bootstrap output file = sitepat.boot + # confidence level = 95% + # SitePat E[count] loBnd hiBnd + x:y 340952.4592501 338825.6604586 342406.6670816 + x:n 46874.1307236 46361.5798377 47438.1857029 + x:d 46034.4670204 45605.6588012 46631.6434277 + y:n 55137.4236715 54650.0763578 55783.7051253 + y:d 43535.5248078 43110.5119922 44234.0919024 + n:d 231953.3372578 229495.3741057 234173.6878092 + x:y:n 91646.1277991 90494.0219749 92873.4443706 + x:y:d 88476.9619569 87137.1867967 89585.8431419 + x:n:d 96676.3877423 95935.5184294 97417.6241185 + y:n:d 100311.4411513 99292.9839140 101163.3457462 + +Here, `loBnd` and `hiBnd` are the limits of a 95% confidence +interval. The bootstrap output files look like `sitepat.boot000`, +`sitepat.boot001`, and so on. + +@copyright Copyright (c) 2016, Alan R. Rogers +. This file is released under the Internet +Systems Consortium License, which can be found in file "LICENSE". +*/ + +#include "binary.h" +#include "boot.h" +#include "rafreader.h" +#include "misc.h" +#include "strint.h" +#include "error.h" +#include "typedefs.h" +#include "version.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAXCHR 24 // maximum number of chromosomes + +typedef struct Stack Stack; + +/// Treat a vector of tipId_t values as a push-down stack. +struct Stack { + int dim, nused; + tipId_t *buff; // not locally owned +}; + +static void usage(void); +static Stack *Stack_new(int dim, tipId_t buff[dim]); +static void Stack_free(Stack * stk); +static void Stack_push(Stack * self, tipId_t x); +static void generatePatterns(int bit, int npops, Stack * stk, tipId_t pat, + int doSing); + +const char *useMsg = + "\nUsage: raf2daf [options] = = ... outgroup=\n" + " where and are arbitrary labels, and are input\n" + " files in raf format. Writes to standard output." + " Labels may not include\n" + " the character \":\". 
Final label must be \"outgroup\".\n"; + +/// Print usage message and die. +static void usage(void) { + fputs(useMsg, stderr); + fprintf(stderr, " Maximum number of input files: %lu plus outgroup.\n", + 8 * sizeof(tipId_t)); + fputs("\nOptions may include:\n", stderr); + tellopt("-f or --bootfile ", + "Bootstrap output file basename. Def: sitepat.boot."); + tellopt("-r or --bootreps ", "# of bootstrap replicates. Def: 0"); + tellopt("-b or --blocksize ", + "# of SNPs per block in moving-blocks bootstrap. Def: 0."); + tellopt("-1 or --singletons", "Use singleton site patterns"); + tellopt("-m or --logMismatch", "Log REF mismatches to raf2daf.log"); + tellopt("-A or --logAA", "Log sites with uncallable ancestral allele"); + tellopt("--version", "Print version and exit"); + tellopt("-h or --help", "Print this message"); + exit(1); +} + +/// This stack is local to this file. It provides a bounds-controlled +/// interface to an external array, which is passed as an argument, buff, +/// to Stack_new. +static Stack *Stack_new(int dim, tipId_t buff[dim]) { + Stack *self = malloc(sizeof(Stack)); + CHECKMEM(self); + self->dim = dim; + self->buff = buff; + self->nused = 0; + return self; +} + +/// Frees the stack but not the underlying buffer. +static void Stack_free(Stack * stk) { + free(stk); +} + +/// Add an entry to the stack, checking bounds. +static void Stack_push(Stack * self, tipId_t x) { + if(self->nused == self->dim) { + fprintf(stderr, "%s:%s:%d ERR: buffer overflow\n", + __FILE__, __func__, __LINE__); + exit(EXIT_FAILURE); + } + self->buff[self->nused++] = x; +} + +/// Call as generatePatterns(0, npops, stk, 0); Recursive function, +/// which generates all legal site patterns and pushes them onto a +/// stack. +static void +generatePatterns(int bit, int npops, Stack * stk, tipId_t pat, int doSing) { + assert(sizeof(tipId_t) < sizeof(unsigned long long)); + if(bit == npops) { + // Recursion stops here. If current pattern is + // legal, then push it onto the stack. 
Then return. + + // Exclude patterns with all bits on, or all bits off. + if(pat == 0 || pat == (1ULL << npops) - 1ULL) + return; + // Exclude singleton patterns unless "doSing" is true. + if(!doSing && isPow2(pat)) + return; + Stack_push(stk, pat); + return; + } + tipId_t on = 1UL << bit; + generatePatterns(bit + 1, npops, stk, pat | on, doSing); // curr bit on + generatePatterns(bit + 1, npops, stk, pat, doSing); // curr bit off +} + +int main(int argc, char **argv) { + int i, j, status, optndx, done; + int doSing = 0; // nonzero means use singleton site patterns + long bootreps = 0; + double conf = 0.95; // confidence level + long blocksize = 500; + StrInt *strint = StrInt_new(); + char bootfname[FILENAMESIZE] = { '\0' }; + char errbuff[100] = { '\0' }; + const char *logfname = "raf2daf.log"; + int logMismatch = 0, logAA = 0; + FILE *logfile = NULL; + + static struct option myopts[] = { + // {char *name, int has_arg, int *flag, int val} + {"bootfile", required_argument, 0, 'f'}, + {"bootreps", required_argument, 0, 'r'}, + {"blocksize", required_argument, 0, 'b'}, + {"singletons", no_argument, 0, '1'}, + {"logMismatch", no_argument, 0, 'm'}, + {"logAA", no_argument, 0, 'A'}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + {NULL, 0, NULL, 0} + }; + + // command line arguments + for(;;) { + i = getopt_long(argc, argv, "b:c:f:hr:t:mAv1", myopts, &optndx); + if(i == -1) + break; + switch (i) { + case ':': + case '?': + usage(); + break; + case 'b': + blocksize = strtod(optarg, NULL); + if(blocksize <= 0) { + fprintf(stderr, + "%s:%d: bad argument to -b or --blocksize: \"%s\"\n", + __FILE__, __LINE__, optarg); + usage(); + } + break; + case 'f': + status = snprintf(bootfname, sizeof bootfname, "%s", optarg); + if(status >= sizeof bootfname) { + fprintf(stderr, "%s:%d: ERR: Filename %s is too large." 
+ " Max: %zu\n", + __FILE__, __LINE__, optarg, sizeof(bootfname) - 1); + exit(EXIT_FAILURE); + } + break; + case 'V': + printf("raf2daf version %s\n", VERSION); + return 0; + case 'h': + usage(); + break; + case 'r': + bootreps = strtol(optarg, NULL, 10); + break; + case '1': + doSing = 1; + break; + case 'm': + logMismatch = 1; + break; + case 'A': + logAA = 1; + break; + default: + usage(); + } + } + + // remaining options: input files + int n = argc - optind; // number of input files + int m = n-1; // number excluding outgroup + if(n == 0) + usage(); + + char *poplbl[n]; + char *fname[n]; + LblNdx lndx; + LblNdx_init(&lndx); + RAFReader *r[n]; + + // Number of inputs can't exceed number of bits in an object of + // type tipId_t. + if(m > 8 * sizeof(tipId_t)) { + fprintf(stderr, "Error: %d input files. Max is %lu.\n", + n, 8*sizeof(tipId_t) + 1); + usage(); + } + // Parse remaining arguments, each of which should be of form + // x=foo, where x is an arbitrary label and foo is the name of an + // input file. Last label must be "outgroup". 
+ for(i = 0; i < n; ++i) { + fname[i] = poplbl[i] = argv[i + optind]; + (void) strsep(fname + i, "="); + if(fname[i] == NULL + || poplbl[i] == NULL + || strlen(poplbl[i]) == 0 + || strlen(fname[i]) == 0 || strchr(poplbl[i], ':') != NULL) + usage(); + if(i < m) + LblNdx_addSamples(&lndx, 1, poplbl[i]); + r[i] = RAFReader_new(fname[i]); + } + if(0 != strcmp("outgroup", poplbl[n-1])) { + fprintf(stderr,"%s:%d: last label is \"%s\"" + " instead of \"outgroup\".\n", + __FILE__,__LINE__, poplbl[n-1]); + usage(); + } + + if(logMismatch || logAA) { + logfile = fopen(logfname, "w"); + if(logfile == NULL) { + fprintf(stderr, "Can't write to file \"%s\".\n", logfname); + exit(EXIT_FAILURE); + } + } + + // Default boot file name + if(bootfname[0] == '\0') { + const char *defName = "raf2daf.boot"; + status = snprintf(bootfname, sizeof bootfname, "%s", defName); + if(status >= sizeof bootfname) { + fprintf(stderr, "%s:%d: ERR: Filename %s is too large." + " Max: %zu\n", + __FILE__, __LINE__, defName, sizeof(bootfname) - 1); + exit(EXIT_FAILURE); + } + } + + printf("# raf2daf version %s\n", VERSION); + printf("# Population labels:\n"); + for(i = 0; i < n; ++i) + printf("# %s=%s\n", poplbl[i], fname[i]); + + // make sure labels are all different + for(i = 1; i < n; ++i) + for(j = 0; j < i; ++j) + if(0 == strcmp(poplbl[i], poplbl[j])) { + fprintf(stderr, "ERR: duplicate labels on command line.\n"); + fprintf(stderr, " duplicated label: %s\n", poplbl[i]); + exit(EXIT_FAILURE); + } + + unsigned long npat = (1UL << m) - 2UL; // number of site patterns + if(!doSing) + npat -= m; + printf("# %s singleton site patterns.\n", + (doSing ? "Including" : "Excluding")); + printf("# Number of site patterns: %lu\n", npat); + tipId_t pat[npat]; + double patCount[npat]; + int lblsize = 100; + char lblbuff[lblsize]; + memset(patCount, 0, sizeof(patCount)); + + { + // Stack is a interface to array "pat". + Stack *stk = Stack_new(npat, pat); + + // Put site patterns into array "pat". 
+ generatePatterns(0, m, stk, 0, doSing); + Stack_free(stk); + } + + // Sort site patterns. Major sort is by number of "on" bits, + // so that doubleton patterns come first, then tripletons, ets. + // Secondary sort is by order in which labels are listed + // on the command line. + qsort(pat, (size_t) npat, sizeof(pat[0]), compare_tipId); + fflush(stdout); + + // Used by bootstrap + Boot *boot = NULL; + int nchr = 0; + char prev[RAFSTRSIZE], chr[RAFSTRSIZE] = { '\0' }; + long nsnp[MAXCHR]; + memset(nsnp, 0, sizeof nsnp); + + // Read the data to get dimensions: number of chromosomes and + // number of snps per chromosome. Then use these dimensions to + // allocate a bootstrap object. + if(bootreps > 0) { + fprintf(stderr, "Doing 1st pass through data to get dimensions...\n"); + + // First pass through data sets values of + // nchr + // nsnp[i] {i=0..nchr-1} + done=0; + while(!done) { + status = RAFReader_multiNext(n, r); + if(status==0) + status = RAFReader_findDaf(n, r); + switch(status) { + case 0: + break; + case EOF: + done=1; + continue; + case REF_MISMATCH: + case MULTIPLE_ALT: + case NO_ANCESTRAL_ALLELE: + continue; + default: + // something wrong. 
+ mystrerror_r(status, errbuff, sizeof errbuff); + fprintf(stderr,"%s:%d: input error (%s)\n", + __FILE__,__LINE__, errbuff); + exit(EXIT_FAILURE); + } + + assert(strlen(RAFReader_chr(r[0])) < sizeof prev); + strcpy(prev, chr); + strcpy(chr, RAFReader_chr(r[0])); + int diff = strcmp(prev, chr); + if(diff != 0) { + StrInt_insert(strint, chr, nchr); + nsnp[nchr] = 1; + ++nchr; + } else + ++nsnp[nchr - 1]; + } + + for(i = 0; i < n; ++i) { + status = RAFReader_rewind(r[i]); + if(status) { + fprintf(stderr, "%s:%d: ERR: can't rewind input stream.\n", + __FILE__, __LINE__); + fprintf(stderr, " If --bootreps > 0, inputs must be" + " files, not pipes.\n"); + exit(EXIT_FAILURE); + } + } + + // Allocate Boot structure + gsl_rng *rng = gsl_rng_alloc(gsl_rng_taus); + gsl_rng_set(rng, (unsigned long) time(NULL)); + boot = Boot_new(nchr, nsnp, bootreps, npat, blocksize, rng); + gsl_rng_free(rng); + CHECKMEM(boot); + } + + unsigned long nsites = 0, nbadaa = 0, nbadref=0, nmultalt=0; + long snpndx = -1; + + // Iterate through raf files + fprintf(stderr, "Doing %s pass through data to tabulate patterns..\n", + bootreps > 0 ? "2nd" : "single"); + int chrndx = -1, currChr = INT_MAX; + RAFReader_clearChromosomes(n, r); + done=0; + while( !done ) { + status = RAFReader_multiNext(n, r); + if(status==0) + status = RAFReader_findDaf(n, r); + switch(status) { + case 0: + ++nsites; + break; + case EOF: + done=1; + continue; + case REF_MISMATCH: + ++nsites; + ++nbadref; + if(logMismatch) { + fprintf(logfile,"REF mismatch:\n"); + RAFReader_printArray(n, r, logfile); + } + continue; + case MULTIPLE_ALT: + ++nsites; + ++nmultalt; + continue; + case NO_ANCESTRAL_ALLELE: + ++nsites; + ++nbadaa; + if(logAA) { + fprintf(logfile,"Uncallable AA:\n"); + RAFReader_printArray(n, r, logfile); + } + continue; + default: + // something wrong. 
+ mystrerror_r(status, errbuff, sizeof errbuff); + fprintf(stderr,"%s:%d: input error (%s)\n", + __FILE__,__LINE__, errbuff); + exit(EXIT_FAILURE); + } + + if(bootreps > 0) { + // chrndx is index of current chromosome + errno = 0; + chrndx = StrInt_get(strint, RAFReader_chr(r[0])); + if(errno) { + fprintf(stderr, + "%s:%d: ERR: missing index for chromosome: %s\n", + __FILE__, __LINE__, RAFReader_chr(r[0])); + exit(EXIT_FAILURE); + } + if(chrndx != currChr) { + currChr = chrndx; + snpndx = 0; + } else + ++snpndx; + +#ifndef NDEBUG + assert(snpndx < nsnp[chrndx]); +#endif + } + + // p and q are frequencies of derived and ancestral alleles + double p[m], q[m]; + for(j = 0; j < m; ++j) { + p[j] = RAFReader_daf(r[j]); // derived allele freq + q[j] = 1.0 - p[j]; + } + + // Contribution of current snp to each site pattern. Inner + // loop considers each bit in current pattern. If that bit is + // on, multiply z by the derived allele frequency, p. If + // that bit is off, multiply by q=1-p. In the end, z is Prod + // p[j]^bit[j] * q[j]^(1-bit[j]) where bit[j] is the value (0 + // or 1) of the j'th bit. 
+ for(i = 0; i < npat; ++i) { + tipId_t pattern = pat[i]; + double z = 1.0; + for(j = 0; j < m; ++j) { + if(pattern & 1u) + z *= p[j]; + else + z *= q[j]; + pattern >>= 1u; + } + if(!isfinite(z)) { + fprintf(stderr, "%s:%d nonfinite z=%lf\n", + __FILE__, __LINE__, z); + fprintf(stderr, " pattern=%d\n", pat[i]); + for(j = 0; j < m; ++j) + fprintf(stderr, " %d: p=%lf q=%lf\n", j, p[j], q[j]); + } + assert(0 == (pattern & 1)); + patCount[i] += z; + if(bootreps > 0) { + assert(snpndx >= 0); + assert(chrndx >= 0); + Boot_add(boot, chrndx, snpndx, i, z); + } + } +#ifndef NDEBUG + if(bootreps > 0) + Boot_sanityCheck(boot, __FILE__, __LINE__); +#endif + } + printf("# Aligned sites : %lu\n", nsites); + if(nbadref) + printf("# Disagreements about ref allele : %lu\n", nbadref); + if(nmultalt) + printf("# Sites with multiple alt alleles: %lu\n", nmultalt); + if(nbadaa) + printf("# Undetermined ancestral allele : %lu\n", nbadaa); + printf("# Sites used : %lu\n", + nsites - nbadaa - nbadref - nmultalt); + + // boottab[i][j] is the count of the j'th site pattern + // in the i'th bootstrap replicate. + double bootvals[bootreps]; + double boottab[bootreps][npat]; + memset(boottab, 0, sizeof boottab); + + if(bootreps > 0) { + printf("# %s = %s\n", "bootstrap output file", bootfname); + printf("# %s = %4.2lf%%\n", "confidence level", 100 * conf); +#ifndef NDEBUG + Boot_sanityCheck(boot, __FILE__, __LINE__); +#endif + // put site pattern counts into matrix boottab. 
+ for(i = 0; i < bootreps; ++i) + Boot_aggregate(boot, i, npat, boottab[i]); + + // write an output file for each bootstrap replicate + for(j = 0; j < bootreps; ++j) { + char buff[FILENAMESIZE + 3]; + status = snprintf(buff, sizeof buff, "%s%03d", bootfname, j); + if(status >= sizeof buff) + DIE("buffer overflow in snprintf"); + + FILE *fp = fopen(buff, "w"); + if(fp == NULL) { + fprintf(stderr,"%s:%d: can't open \"%s\" for output.\n", + __FILE__,__LINE__,buff); + exit(EXIT_FAILURE); + } + fprintf(fp, "# %13s %20s", "Raf2daf", "E[count]\n"); + for(i = 0; i < npat; ++i) { + fprintf(fp, "%15s %20.7lf\n", + patLbl(lblsize, lblbuff, pat[i], &lndx), + boottab[j][i]); + } + fclose(fp); + } + } + // print labels and binary representation of site patterns + printf("# %13s %20s", "Raf2daf", "E[count]"); + if(bootreps > 0) + printf(" %15s %15s", "loBnd", "hiBnd"); + putchar('\n'); + for(i = 0; i < npat; ++i) { + printf("%15s %20.7lf", + patLbl(lblsize, lblbuff, pat[i], &lndx), patCount[i]); + if(bootreps > 0) { + double lowBnd, highBnd; + for(j = 0; j < bootreps; ++j) + bootvals[j] = boottab[j][i]; + confidenceBounds(&lowBnd, &highBnd, conf, bootreps, bootvals); + printf(" %15.7lf %15.7lf", lowBnd, highBnd); + } + putchar('\n'); + } + + for(i = 0; i < n; ++i) + RAFReader_free(r[i]); + if(bootreps > 0) + Boot_free(boot); + StrInt_free(strint); + if(logfile) + fclose(logfile); + fprintf(stderr, "raf2daf is finished\n"); + return 0; +} From 491d4b0a9ab88a9b2d84d4989bc4b69e7e5aec6d Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sat, 7 Apr 2018 17:55:38 -0600 Subject: [PATCH 096/101] First draft of raf2daf.c. Compiles. I haven't tried linking it. --- src/raf2daf.c | 564 ++++++-------------------------------------------- 1 file changed, 58 insertions(+), 506 deletions(-) diff --git a/src/raf2daf.c b/src/raf2daf.c index 07138b78..75a6ab57 100644 --- a/src/raf2daf.c +++ b/src/raf2daf.c @@ -9,20 +9,12 @@ Raf2daf reads data in .raf format and writes the corresponding daf files. 
# Usage - Usage: raf2daf [options] = = ... - where and are arbitrary labels, and and are input - files in raf format. Writes to standard output. Labels may not include - the character ":". Maximum number of input files: 32. + Usage: raf2daf [options] ... + where and are input files in raf format. The last + input file should be the outgroup. Writes to standard + output. Options may include: - -f or --bootfile - Bootstrap output file basename. Def: raf2daf.boot. - -r or --bootreps - # of bootstrap replicates. Def: 0 - -b or --blocksize - # of SNPs per block in moving-blocks bootstrap. Def: 0. - -1 or --singletons - Use singleton site patterns -m or --logMismatch log AA/DA mismatches to raf2daf.log -A or --logAA @@ -32,112 +24,15 @@ Raf2daf reads data in .raf format and writes the corresponding daf files. -h or --help Print this message -# Example - -Before running `raf2daf`, use @ref raf "raf" to convert the input data -into raf format. Let us assume you have done this, and that directory -~/raf contains a separate raf file for each population. We want to -compare 4 populations, whose .raf files are `yri.raf`, `ceu.raf`, -`altai.raf`, and `denisova.raf`. The following command will do this, -putting the results into `obs.txt`. - - raf2daf x=~/raf/yri.raf \ - y=~/raf/ceu.raf \ - n=~/raf/altai.raf \ - d=~/raf/denisova.raf > obs.txt - -Here, "x", "y", "n", and "d" are labels that will be used to identify -site patterns in the output. For example, site pattern "x:y" refers to -the pattern in which the derived allele is present haploid samples -from "x" and "y" but not on those from other populations. The order of -the command-line arguments determines the order in which labels are -sorted on output. Given the command line above, we would get a site -pattern labeled "x:y:d" rather than, say, "y:x:d". 
- -The output looks like this: - - # Population labels: - # x = /home/rogers/raf/yri.raf - # y = /home/rogers/raf/ceu.raf - # n = /home/rogers/raf/altai.raf - # d = /home/rogers/raf/denisova.raf - # Excluding singleton site patterns. - # Number of site patterns: 10 - # Tabulated 12327755 SNPs - # SitePat E[count] - x:y 340952.4592501 - x:n 46874.1307236 - x:d 46034.4670204 - y:n 55137.4236715 - y:d 43535.5248078 - n:d 231953.3372578 - x:y:n 91646.1277991 - x:y:d 88476.9619569 - x:n:d 96676.3877423 - y:n:d 100311.4411513 - -The left column lists the site patterns that occur in the data. The -right column gives the expected count of each site pattern. These are -not integers, because they represent averages over all possible -subsamples consisting of a single haploid genome from each -population. - -In the raf files used as input, chromosomes should appear in lexical -order. Within each chromosome, nucleotides should appear in numerical -order. There should be no duplicate (chromosome, position) -pairs. Otherwise, the program aborts with an error. - -To generate a bootstrap, use the `--bootreps` option: - - sitepat --bootreps 50 \ - x=~/raf/yri.raf \ - y=~/raf/ceu.raf \ - n=~/raf/altai.raf \ - d=~/raf/denisova.raf > obs.txt - -This will generate not only the primary output file, `obs.txt`, but also -50 additional files, each representing a single bootstrap -replicate. The primary output file now has a bootstrap confidence -interval: - - # Population labels: - # x = /home/rogers/raf/yri.raf - # y = /home/rogers/raf/ceu.raf - # n = /home/rogers/raf/altai.raf - # d = /home/rogers/raf/denisova.raf - # Excluding singleton site patterns. 
- # Number of site patterns: 10 - # Tabulated 12327755 SNPs - # bootstrap output file = sitepat.boot - # confidence level = 95% - # SitePat E[count] loBnd hiBnd - x:y 340952.4592501 338825.6604586 342406.6670816 - x:n 46874.1307236 46361.5798377 47438.1857029 - x:d 46034.4670204 45605.6588012 46631.6434277 - y:n 55137.4236715 54650.0763578 55783.7051253 - y:d 43535.5248078 43110.5119922 44234.0919024 - n:d 231953.3372578 229495.3741057 234173.6878092 - x:y:n 91646.1277991 90494.0219749 92873.4443706 - x:y:d 88476.9619569 87137.1867967 89585.8431419 - x:n:d 96676.3877423 95935.5184294 97417.6241185 - y:n:d 100311.4411513 99292.9839140 101163.3457462 - -Here, `loBnd` and `hiBnd` are the limits of a 95% confidence -interval. The bootstrap output files look like `sitepat.boot000`, -`sitepat.boot001`, and so on. - -@copyright Copyright (c) 2016, Alan R. Rogers +@copyright Copyright (c) 2018, Alan R. Rogers . This file is released under the Internet Systems Consortium License, which can be found in file "LICENSE". */ -#include "binary.h" -#include "boot.h" #include "rafreader.h" #include "misc.h" #include "strint.h" #include "error.h" -#include "typedefs.h" #include "version.h" #include #include @@ -149,42 +44,18 @@ Systems Consortium License, which can be found in file "LICENSE". #include #include -#define MAXCHR 24 // maximum number of chromosomes - -typedef struct Stack Stack; - -/// Treat a vector of tipId_t values as a push-down stack. -struct Stack { - int dim, nused; - tipId_t *buff; // not locally owned -}; - static void usage(void); -static Stack *Stack_new(int dim, tipId_t buff[dim]); -static void Stack_free(Stack * stk); -static void Stack_push(Stack * self, tipId_t x); -static void generatePatterns(int bit, int npops, Stack * stk, tipId_t pat, - int doSing); const char *useMsg = - "\nUsage: raf2daf [options] = = ... outgroup=\n" - " where and are arbitrary labels, and are input\n" - " files in raf format. Writes to standard output." 
- " Labels may not include\n" - " the character \":\". Final label must be \"outgroup\".\n"; + "\nUsage: raf2daf [options] ... \n" + " where are input files in raf format, the last of which\n" + " should be the outgroup. Output daf files have names like those\n" + " of input files but with raf changed to daf."; /// Print usage message and die. static void usage(void) { fputs(useMsg, stderr); - fprintf(stderr, " Maximum number of input files: %lu plus outgroup.\n", - 8 * sizeof(tipId_t)); fputs("\nOptions may include:\n", stderr); - tellopt("-f or --bootfile ", - "Bootstrap output file basename. Def: sitepat.boot."); - tellopt("-r or --bootreps ", "# of bootstrap replicates. Def: 0"); - tellopt("-b or --blocksize ", - "# of SNPs per block in moving-blocks bootstrap. Def: 0."); - tellopt("-1 or --singletons", "Use singleton site patterns"); tellopt("-m or --logMismatch", "Log REF mismatches to raf2daf.log"); tellopt("-A or --logAA", "Log sites with uncallable ancestral allele"); tellopt("--version", "Print version and exit"); @@ -192,65 +63,8 @@ static void usage(void) { exit(1); } -/// This stack is local to this file. It provides a bounds-controlled -/// interface to an external array, which is passed as an argument, buff, -/// to Stack_new. -static Stack *Stack_new(int dim, tipId_t buff[dim]) { - Stack *self = malloc(sizeof(Stack)); - CHECKMEM(self); - self->dim = dim; - self->buff = buff; - self->nused = 0; - return self; -} - -/// Frees the stack but not the underlying buffer. -static void Stack_free(Stack * stk) { - free(stk); -} - -/// Add an entry to the stack, checking bounds. -static void Stack_push(Stack * self, tipId_t x) { - if(self->nused == self->dim) { - fprintf(stderr, "%s:%s:%d ERR: buffer overflow\n", - __FILE__, __func__, __LINE__); - exit(EXIT_FAILURE); - } - self->buff[self->nused++] = x; -} - -/// Call as generatePatterns(0, npops, stk, 0); Recursive function, -/// which generates all legal site patterns and pushes them onto a -/// stack. 
-static void -generatePatterns(int bit, int npops, Stack * stk, tipId_t pat, int doSing) { - assert(sizeof(tipId_t) < sizeof(unsigned long long)); - if(bit == npops) { - // Recursion stops here. If current pattern is - // legal, then push it onto the stack. Then return. - - // Exclude patterns with all bits on, or all bits off. - if(pat == 0 || pat == (1ULL << npops) - 1ULL) - return; - // Exclude singleton patterns unless "doSing" is true. - if(!doSing && isPow2(pat)) - return; - Stack_push(stk, pat); - return; - } - tipId_t on = 1UL << bit; - generatePatterns(bit + 1, npops, stk, pat | on, doSing); // curr bit on - generatePatterns(bit + 1, npops, stk, pat, doSing); // curr bit off -} - int main(int argc, char **argv) { int i, j, status, optndx, done; - int doSing = 0; // nonzero means use singleton site patterns - long bootreps = 0; - double conf = 0.95; // confidence level - long blocksize = 500; - StrInt *strint = StrInt_new(); - char bootfname[FILENAMESIZE] = { '\0' }; char errbuff[100] = { '\0' }; const char *logfname = "raf2daf.log"; int logMismatch = 0, logAA = 0; @@ -258,10 +72,6 @@ int main(int argc, char **argv) { static struct option myopts[] = { // {char *name, int has_arg, int *flag, int val} - {"bootfile", required_argument, 0, 'f'}, - {"bootreps", required_argument, 0, 'r'}, - {"blocksize", required_argument, 0, 'b'}, - {"singletons", no_argument, 0, '1'}, {"logMismatch", no_argument, 0, 'm'}, {"logAA", no_argument, 0, 'A'}, {"help", no_argument, 0, 'h'}, @@ -271,50 +81,20 @@ int main(int argc, char **argv) { // command line arguments for(;;) { - i = getopt_long(argc, argv, "b:c:f:hr:t:mAv1", myopts, &optndx); + i = getopt_long(argc, argv, "hmAV", myopts, &optndx); if(i == -1) break; switch (i) { - case ':': - case '?': - usage(); - break; - case 'b': - blocksize = strtod(optarg, NULL); - if(blocksize <= 0) { - fprintf(stderr, - "%s:%d: bad argument to -b or --blocksize: \"%s\"\n", - __FILE__, __LINE__, optarg); - usage(); - } - break; - case 'f': 
- status = snprintf(bootfname, sizeof bootfname, "%s", optarg); - if(status >= sizeof bootfname) { - fprintf(stderr, "%s:%d: ERR: Filename %s is too large." - " Max: %zu\n", - __FILE__, __LINE__, optarg, sizeof(bootfname) - 1); - exit(EXIT_FAILURE); - } - break; case 'V': printf("raf2daf version %s\n", VERSION); return 0; - case 'h': - usage(); - break; - case 'r': - bootreps = strtol(optarg, NULL, 10); - break; - case '1': - doSing = 1; - break; case 'm': logMismatch = 1; break; case 'A': logAA = 1; break; + case 'h': default: usage(); } @@ -326,39 +106,40 @@ int main(int argc, char **argv) { if(n == 0) usage(); - char *poplbl[n]; - char *fname[n]; - LblNdx lndx; - LblNdx_init(&lndx); + char *ifname[n]; + FILE *ofp[m]; RAFReader *r[n]; - // Number of inputs can't exceed number of bits in an object of - // type tipId_t. - if(m > 8 * sizeof(tipId_t)) { - fprintf(stderr, "Error: %d input files. Max is %lu.\n", - n, 8*sizeof(tipId_t) + 1); - usage(); - } - // Parse remaining arguments, each of which should be of form - // x=foo, where x is an arbitrary label and foo is the name of an - // input file. Last label must be "outgroup". + // Parse remaining arguments, each of which should be the name of + // an input file. 
for(i = 0; i < n; ++i) { - fname[i] = poplbl[i] = argv[i + optind]; - (void) strsep(fname + i, "="); - if(fname[i] == NULL - || poplbl[i] == NULL - || strlen(poplbl[i]) == 0 - || strlen(fname[i]) == 0 || strchr(poplbl[i], ':') != NULL) - usage(); - if(i < m) - LblNdx_addSamples(&lndx, 1, poplbl[i]); - r[i] = RAFReader_new(fname[i]); - } - if(0 != strcmp("outgroup", poplbl[n-1])) { - fprintf(stderr,"%s:%d: last label is \"%s\"" - " instead of \"outgroup\".\n", - __FILE__,__LINE__, poplbl[n-1]); - usage(); + ifname[i] = argv[i + optind]; + r[i] = RAFReader_new(ifname[i]); + if(i == n-1) + continue; + char *start = strrchr(ifname[i], '/'); + if(start == NULL) + start = ifname[i]; + else + ++start; + char *ofname = strdup(start); + int k = strlen(ofname); + if(0 != strcmp("raf", ofname + k-3)) { + fprintf(stderr,"%s:%d: input files should end with \"raf\".\n", + __FILE__,__LINE__); + fprintf(stderr," got \"%s\".\n", ofname); + exit(EXIT_FAILURE); + } + ofname[k-3] = 'd'; + ofp[i] = fopen(ofname, "w"); + if(ofp[i] == NULL) { + fprintf(stderr,"%s:%d: can't open \"%s\" for output.\n", + __FILE__,__LINE__, ofname); + exit(EXIT_FAILURE); + } + fprintf(stderr,"writing to %s\n", ofname); + fprintf(ofp[i], "#%3s %10s %2s %2s %20s\n", + "chr", "pos", "aa", "da", "daf"); } if(logMismatch || logAA) { @@ -369,137 +150,11 @@ int main(int argc, char **argv) { } } - // Default boot file name - if(bootfname[0] == '\0') { - const char *defName = "raf2daf.boot"; - status = snprintf(bootfname, sizeof bootfname, "%s", defName); - if(status >= sizeof bootfname) { - fprintf(stderr, "%s:%d: ERR: Filename %s is too large." 
- " Max: %zu\n", - __FILE__, __LINE__, defName, sizeof(bootfname) - 1); - exit(EXIT_FAILURE); - } - } - printf("# raf2daf version %s\n", VERSION); - printf("# Population labels:\n"); - for(i = 0; i < n; ++i) - printf("# %s=%s\n", poplbl[i], fname[i]); - - // make sure labels are all different - for(i = 1; i < n; ++i) - for(j = 0; j < i; ++j) - if(0 == strcmp(poplbl[i], poplbl[j])) { - fprintf(stderr, "ERR: duplicate labels on command line.\n"); - fprintf(stderr, " duplicated label: %s\n", poplbl[i]); - exit(EXIT_FAILURE); - } - - unsigned long npat = (1UL << m) - 2UL; // number of site patterns - if(!doSing) - npat -= m; - printf("# %s singleton site patterns.\n", - (doSing ? "Including" : "Excluding")); - printf("# Number of site patterns: %lu\n", npat); - tipId_t pat[npat]; - double patCount[npat]; - int lblsize = 100; - char lblbuff[lblsize]; - memset(patCount, 0, sizeof(patCount)); - - { - // Stack is a interface to array "pat". - Stack *stk = Stack_new(npat, pat); - - // Put site patterns into array "pat". - generatePatterns(0, m, stk, 0, doSing); - Stack_free(stk); - } - - // Sort site patterns. Major sort is by number of "on" bits, - // so that doubleton patterns come first, then tripletons, ets. - // Secondary sort is by order in which labels are listed - // on the command line. - qsort(pat, (size_t) npat, sizeof(pat[0]), compare_tipId); - fflush(stdout); - - // Used by bootstrap - Boot *boot = NULL; - int nchr = 0; - char prev[RAFSTRSIZE], chr[RAFSTRSIZE] = { '\0' }; - long nsnp[MAXCHR]; - memset(nsnp, 0, sizeof nsnp); - - // Read the data to get dimensions: number of chromosomes and - // number of snps per chromosome. Then use these dimensions to - // allocate a bootstrap object. 
- if(bootreps > 0) { - fprintf(stderr, "Doing 1st pass through data to get dimensions...\n"); - - // First pass through data sets values of - // nchr - // nsnp[i] {i=0..nchr-1} - done=0; - while(!done) { - status = RAFReader_multiNext(n, r); - if(status==0) - status = RAFReader_findDaf(n, r); - switch(status) { - case 0: - break; - case EOF: - done=1; - continue; - case REF_MISMATCH: - case MULTIPLE_ALT: - case NO_ANCESTRAL_ALLELE: - continue; - default: - // something wrong. - mystrerror_r(status, errbuff, sizeof errbuff); - fprintf(stderr,"%s:%d: input error (%s)\n", - __FILE__,__LINE__, errbuff); - exit(EXIT_FAILURE); - } - - assert(strlen(RAFReader_chr(r[0])) < sizeof prev); - strcpy(prev, chr); - strcpy(chr, RAFReader_chr(r[0])); - int diff = strcmp(prev, chr); - if(diff != 0) { - StrInt_insert(strint, chr, nchr); - nsnp[nchr] = 1; - ++nchr; - } else - ++nsnp[nchr - 1]; - } - - for(i = 0; i < n; ++i) { - status = RAFReader_rewind(r[i]); - if(status) { - fprintf(stderr, "%s:%d: ERR: can't rewind input stream.\n", - __FILE__, __LINE__); - fprintf(stderr, " If --bootreps > 0, inputs must be" - " files, not pipes.\n"); - exit(EXIT_FAILURE); - } - } - - // Allocate Boot structure - gsl_rng *rng = gsl_rng_alloc(gsl_rng_taus); - gsl_rng_set(rng, (unsigned long) time(NULL)); - boot = Boot_new(nchr, nsnp, bootreps, npat, blocksize, rng); - gsl_rng_free(rng); - CHECKMEM(boot); - } unsigned long nsites = 0, nbadaa = 0, nbadref=0, nmultalt=0; - long snpndx = -1; // Iterate through raf files - fprintf(stderr, "Doing %s pass through data to tabulate patterns..\n", - bootreps > 0 ? 
"2nd" : "single"); - int chrndx = -1, currChr = INT_MAX; RAFReader_clearChromosomes(n, r); done=0; while( !done ) { @@ -541,69 +196,23 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } - if(bootreps > 0) { - // chrndx is index of current chromosome - errno = 0; - chrndx = StrInt_get(strint, RAFReader_chr(r[0])); - if(errno) { - fprintf(stderr, - "%s:%d: ERR: missing index for chromosome: %s\n", - __FILE__, __LINE__, RAFReader_chr(r[0])); - exit(EXIT_FAILURE); - } - if(chrndx != currChr) { - currChr = chrndx; - snpndx = 0; - } else - ++snpndx; - -#ifndef NDEBUG - assert(snpndx < nsnp[chrndx]); -#endif - } - - // p and q are frequencies of derived and ancestral alleles - double p[m], q[m]; for(j = 0; j < m; ++j) { - p[j] = RAFReader_daf(r[j]); // derived allele freq - q[j] = 1.0 - p[j]; - } - - // Contribution of current snp to each site pattern. Inner - // loop considers each bit in current pattern. If that bit is - // on, multiply z by the derived allele frequency, p. If - // that bit is off, multiply by q=1-p. In the end, z is Prod - // p[j]^bit[j] * q[j]^(1-bit[j]) where bit[j] is the value (0 - // or 1) of the j'th bit. 
- for(i = 0; i < npat; ++i) { - tipId_t pattern = pat[i]; - double z = 1.0; - for(j = 0; j < m; ++j) { - if(pattern & 1u) - z *= p[j]; - else - z *= q[j]; - pattern >>= 1u; - } - if(!isfinite(z)) { - fprintf(stderr, "%s:%d nonfinite z=%lf\n", - __FILE__, __LINE__, z); - fprintf(stderr, " pattern=%d\n", pat[i]); - for(j = 0; j < m; ++j) - fprintf(stderr, " %d: p=%lf q=%lf\n", j, p[j], q[j]); - } - assert(0 == (pattern & 1)); - patCount[i] += z; - if(bootreps > 0) { - assert(snpndx >= 0); - assert(chrndx >= 0); - Boot_add(boot, chrndx, snpndx, i, z); + // frequencies of reference and derived alleles + double raf = RAFReader_raf(r[j]); + double daf = RAFReader_daf(r[j]); + const char *ancestral, *derived; + if(raf == daf) { + derived = RAFReader_ref(r[j]); + ancestral = RAFReader_alt(r[j]); + }else{ + derived = RAFReader_alt(r[j]); + ancestral = RAFReader_ref(r[j]); } + fprintf(ofp[j], "%4s %10lu %2s %2s %20.18f\n", + RAFReader_chr(r[j]), + RAFReader_nucpos(r[j]), + ancestral, derived, daf); } -#ifndef NDEBUG - if(bootreps > 0) - Boot_sanityCheck(boot, __FILE__, __LINE__); -#endif } printf("# Aligned sites : %lu\n", nsites); if(nbadref) @@ -615,67 +224,10 @@ int main(int argc, char **argv) { printf("# Sites used : %lu\n", nsites - nbadaa - nbadref - nmultalt); - // boottab[i][j] is the count of the j'th site pattern - // in the i'th bootstrap replicate. - double bootvals[bootreps]; - double boottab[bootreps][npat]; - memset(boottab, 0, sizeof boottab); - - if(bootreps > 0) { - printf("# %s = %s\n", "bootstrap output file", bootfname); - printf("# %s = %4.2lf%%\n", "confidence level", 100 * conf); -#ifndef NDEBUG - Boot_sanityCheck(boot, __FILE__, __LINE__); -#endif - // put site pattern counts into matrix boottab. 
- for(i = 0; i < bootreps; ++i) - Boot_aggregate(boot, i, npat, boottab[i]); - - // write an output file for each bootstrap replicate - for(j = 0; j < bootreps; ++j) { - char buff[FILENAMESIZE + 3]; - status = snprintf(buff, sizeof buff, "%s%03d", bootfname, j); - if(status >= sizeof buff) - DIE("buffer overflow in snprintf"); - - FILE *fp = fopen(buff, "w"); - if(fp == NULL) { - fprintf(stderr,"%s:%d: can't open \"%s\" for output.\n", - __FILE__,__LINE__,buff); - exit(EXIT_FAILURE); - } - fprintf(fp, "# %13s %20s", "Raf2daf", "E[count]\n"); - for(i = 0; i < npat; ++i) { - fprintf(fp, "%15s %20.7lf\n", - patLbl(lblsize, lblbuff, pat[i], &lndx), - boottab[j][i]); - } - fclose(fp); - } - } - // print labels and binary representation of site patterns - printf("# %13s %20s", "Raf2daf", "E[count]"); - if(bootreps > 0) - printf(" %15s %15s", "loBnd", "hiBnd"); - putchar('\n'); - for(i = 0; i < npat; ++i) { - printf("%15s %20.7lf", - patLbl(lblsize, lblbuff, pat[i], &lndx), patCount[i]); - if(bootreps > 0) { - double lowBnd, highBnd; - for(j = 0; j < bootreps; ++j) - bootvals[j] = boottab[j][i]; - confidenceBounds(&lowBnd, &highBnd, conf, bootreps, bootvals); - printf(" %15.7lf %15.7lf", lowBnd, highBnd); - } - putchar('\n'); - } - for(i = 0; i < n; ++i) RAFReader_free(r[i]); - if(bootreps > 0) - Boot_free(boot); - StrInt_free(strint); + for(i=0; i < m; ++i) + fclose(ofp[i]); if(logfile) fclose(logfile); fprintf(stderr, "raf2daf is finished\n"); From f7c7721e67be67a9900da937c91ed1ad67bfbb0e Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sun, 8 Apr 2018 08:13:33 -0600 Subject: [PATCH 097/101] . 
--- src/raf2daf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/raf2daf.c b/src/raf2daf.c index 75a6ab57..799025f4 100644 --- a/src/raf2daf.c +++ b/src/raf2daf.c @@ -94,7 +94,6 @@ int main(int argc, char **argv) { case 'A': logAA = 1; break; - case 'h': default: usage(); } @@ -103,7 +102,7 @@ int main(int argc, char **argv) { // remaining options: input files int n = argc - optind; // number of input files int m = n-1; // number excluding outgroup - if(n == 0) + if(m < 2) usage(); char *ifname[n]; From 1d2be83274dc98f7d1fa97d66104ffa908e8d330 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sun, 8 Apr 2018 10:39:09 -0600 Subject: [PATCH 098/101] raf2daf compiles, links, and runs --- src/Makefile | 7 ++++++- src/raf2daf.c | 21 ++++++++++++--------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/Makefile b/src/Makefile index 57a62efc..ebee7f70 100644 --- a/src/Makefile +++ b/src/Makefile @@ -7,7 +7,8 @@ opt := -DNDEBUG -O3 -finline-functions # For full optimization prof := incl := -I/usr/local/include -I/opt/local/include -targets := legosim legofit tabpat sitepat scrmpat daf raf numcores joinraf +targets := legosim legofit tabpat sitepat scrmpat daf raf numcores joinraf \ + raf2daf pytargets := diverg.py bootci.py flatfile.py axt2raf.py tests := xzeroin xbinary @@ -70,6 +71,10 @@ JOINRAF := joinraf.o rafreader.o error.o tokenizer.o misc.o joinraf : $(JOINRAF) $(CC) $(CFLAGS) -o $@ $(JOINRAF) $(lib) +RAF2DAF := raf2daf.o rafreader.o error.o tokenizer.o misc.o +raf2daf : $(RAF2DAF) + $(CC) $(CFLAGS) -o $@ $(RAF2DAF) $(lib) + SCRMPAT := scrmpat.o misc.o binary.o lblndx.o parkeyval.o scrmreader.o \ tokenizer.o boot.o error.o scrmpat : $(SCRMPAT) diff --git a/src/raf2daf.c b/src/raf2daf.c index 799025f4..79edeb4a 100644 --- a/src/raf2daf.c +++ b/src/raf2daf.c @@ -47,10 +47,11 @@ Systems Consortium License, which can be found in file "LICENSE". 
static void usage(void); const char *useMsg = - "\nUsage: raf2daf [options] ... \n" + "\nUsage: raf2daf [options] ... \n" " where are input files in raf format, the last of which\n" " should be the outgroup. Output daf files have names like those\n" - " of input files but with raf changed to daf."; + " of input files but with raf changed to daf. At least 3 input\n" + " files are required"; /// Print usage message and die. static void usage(void) { @@ -102,8 +103,10 @@ int main(int argc, char **argv) { // remaining options: input files int n = argc - optind; // number of input files int m = n-1; // number excluding outgroup - if(m < 2) + if(m < 2) { + fprintf(stderr,"At least 3 input files are required\n"); usage(); + } char *ifname[n]; FILE *ofp[m]; @@ -149,7 +152,7 @@ int main(int argc, char **argv) { } } - printf("# raf2daf version %s\n", VERSION); + printf("raf2daf version %s\n", VERSION); unsigned long nsites = 0, nbadaa = 0, nbadref=0, nmultalt=0; @@ -213,14 +216,14 @@ int main(int argc, char **argv) { ancestral, derived, daf); } } - printf("# Aligned sites : %lu\n", nsites); + printf("Aligned sites : %lu\n", nsites); if(nbadref) - printf("# Disagreements about ref allele : %lu\n", nbadref); + printf("Disagreements about ref allele : %lu\n", nbadref); if(nmultalt) - printf("# Sites with multiple alt alleles: %lu\n", nmultalt); + printf("Sites with multiple alt alleles: %lu\n", nmultalt); if(nbadaa) - printf("# Undetermined ancestral allele : %lu\n", nbadaa); - printf("# Sites used : %lu\n", + printf("Undetermined ancestral allele : %lu\n", nbadaa); + printf("Sites used : %lu\n", nsites - nbadaa - nbadref - nmultalt); for(i = 0; i < n; ++i) From d46f8f2e90e02be2f272d4ae29b1b31bdcb3b124 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Sun, 8 Apr 2018 10:52:07 -0600 Subject: [PATCH 099/101] raf2daf seems to work. Not tested extensively. 
--- src/daf.c | 4 ++-- src/raf2daf.c | 4 ++-- test/xrafreader.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/daf.c b/src/daf.c index 489995e6..ee76b4a5 100644 --- a/src/daf.c +++ b/src/daf.c @@ -80,7 +80,7 @@ int main(int argc, char **argv) { long unsigned lastnucpos = 0, nucpos; char lastchr[100] = { '\0' }; - printf("#%3s %10s %2s %2s %20s\n", "chr", "pos", "aa", "da", "daf"); + printf("#%3s %10s %2s %2s %s\n", "chr", "pos", "aa", "da", "daf"); while(1) { if(NULL == fgets(buff, buffsize, stdin)) { break; @@ -325,7 +325,7 @@ int main(int argc, char **argv) { if(aai == 1) x = n - x; double p = x / ((double) n); - printf("%4s %10s %2s %2s %20.18f\n", + printf("%4s %10s %2s %2s %0.18g\n", chr, pos, aa[0], alleles[1 - aai], p); } fprintf(stderr, "daf: %ld good sites; %ld rejected\n", ngood, nbad); diff --git a/src/raf2daf.c b/src/raf2daf.c index 79edeb4a..9b8b62b7 100644 --- a/src/raf2daf.c +++ b/src/raf2daf.c @@ -140,7 +140,7 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } fprintf(stderr,"writing to %s\n", ofname); - fprintf(ofp[i], "#%3s %10s %2s %2s %20s\n", + fprintf(ofp[i], "#%3s %10s %2s %2s %s\n", "chr", "pos", "aa", "da", "daf"); } @@ -210,7 +210,7 @@ int main(int argc, char **argv) { derived = RAFReader_alt(r[j]); ancestral = RAFReader_ref(r[j]); } - fprintf(ofp[j], "%4s %10lu %2s %2s %20.18f\n", + fprintf(ofp[j], "%4s %10lu %2s %2s %0.18g\n", RAFReader_chr(r[j]), RAFReader_nucpos(r[j]), ancestral, derived, daf); diff --git a/test/xrafreader.c b/test/xrafreader.c index fe70d5d5..d8b1ca7a 100644 --- a/test/xrafreader.c +++ b/test/xrafreader.c @@ -29,12 +29,12 @@ const char *badInput = const char *tstInput[3] = { "#chr\tpos\tref\talt\traf\n" - "1\t1\ta\t.\t0\n" + "1\t1\ta\tt\t0\n" "10\t1\ta\tt\t5e-1\n" "10\t200\tg\tc\t1e0\n", "#chr\tpos\tref\talt\traf\n" - "1\t1\ta\t.\t0.5\n" + "1\t1\ta\tt\t0.5\n" "1\t2\ta\t.\t0.5\n" "10\t1\ta\tt\t1e-1\n" "10\t200\tg\tc\t1\n", From b6e83fa7da279556c7d0712971bef34ffe90df98 Mon Sep 17 
00:00:00 2001 From: "Alan R. Rogers" Date: Sun, 8 Apr 2018 10:53:13 -0600 Subject: [PATCH 100/101] version 1.24 --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.h b/src/version.h index c0a8c22c..f1ba6b42 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.23" +#define VERSION "1.24" #endif From 7f3b24687da804536f94ba4412d68c882b1077f3 Mon Sep 17 00:00:00 2001 From: "Alan R. Rogers" Date: Tue, 10 Apr 2018 16:40:44 -0500 Subject: [PATCH 101/101] Version 1.25. Legofit now prints Akaike's information criterion. --- src/legofit.c | 39 ++++++++++++++++++++++++--------------- src/version.h | 2 +- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/legofit.c b/src/legofit.c index 56a1b449..4b686a18 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -552,9 +552,9 @@ int main(int argc, char **argv) { #endif // Observed site pattern frequencies - BranchTab *obs = BranchTab_parse(patfname, &lblndx); + BranchTab *rawObs = BranchTab_parse(patfname, &lblndx); if(doSing) { - if(!BranchTab_hasSingletons(obs)) { + if(!BranchTab_hasSingletons(rawObs)) { fprintf(stderr, "%s:%d: Command line includes singletons " "(-1 or --singletons)\n" " but none are present in \"%s\".\n", @@ -562,7 +562,7 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } } else { - if(BranchTab_hasSingletons(obs)) { + if(BranchTab_hasSingletons(rawObs)) { fprintf(stderr, "%s:%d: Command line excludes singletons " "(neither -1 nor --singletons)\n" " but singletons are present in \"%s\".\n", @@ -570,6 +570,7 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } } + BranchTab *obs = BranchTab_dup(rawObs); #if COST==KL_COST BranchTab_normalize(obs); #endif @@ -622,6 +623,23 @@ int main(int argc, char **argv) { DEStatus destat = diffev(dim, estimate, &cost, &yspread, dep, rng); + // Get mean site pattern branch lengths + if(GPTree_setParams(gptree, dim, estimate)) { + fprintf(stderr, "%s:%d: free 
params violate constraints\n", + __FILE__, __LINE__); + exit(1); + } + BranchTab *bt = patprob(gptree, simreps, doSing, rng); + BranchTab_divideBy(bt, (double) simreps); + // BranchTab_print(bt, stdout); + + // Calculate AIC + BranchTab *prob = BranchTab_dup(bt); + BranchTab_normalize(prob); + double negLnL = BranchTab_negLnL(rawObs, prob); + double aic = 2.0*negLnL + 2.0*GPTree_nFree(gptree); + BranchTab_free(prob); + const char *whyDEstopped; switch(destat) { case ReachedGoal: @@ -637,22 +655,12 @@ int main(int argc, char **argv) { whyDEstopped = "stopped_for_an_unknown_reason"; } - printf("DiffEv %s. cost=%0.5le spread=%0.5le\n", + printf("DiffEv %s. cost=%0.5le spread=%0.5le", whyDEstopped, cost, yspread); #if COST==LNL_COST printf(" relspread=%e", yspread / cost); #endif - putchar('\n'); - - // Get mean site pattern branch lengths - if(GPTree_setParams(gptree, dim, estimate)) { - fprintf(stderr, "%s:%d: free params violate constraints\n", - __FILE__, __LINE__); - exit(1); - } - BranchTab *bt = patprob(gptree, simreps, doSing, rng); - BranchTab_divideBy(bt, (double) simreps); - // BranchTab_print(bt, stdout); + printf(" AIC=%0.15g\n", aic); printf("Fitted parameter values\n"); #if 1 @@ -687,6 +695,7 @@ int main(int argc, char **argv) { } BranchTab_free(bt); + BranchTab_free(rawObs); BranchTab_free(obs); gsl_rng_free(rng); GPTree_sanityCheck(gptree, __FILE__, __LINE__); diff --git a/src/version.h b/src/version.h index f1ba6b42..46299586 100644 --- a/src/version.h +++ b/src/version.h @@ -1,3 +1,3 @@ #ifndef VERSION -#define VERSION "1.24" +#define VERSION "1.25" #endif