diff --git a/.gitignore b/.gitignore index 831996c4..44fbbfbf 100644 --- a/.gitignore +++ b/.gitignore @@ -2,10 +2,13 @@ *~ core depend +raf +version.h xboot xdtnorm xexopar xhashtab +xerror xgene xgptree xparse @@ -24,12 +27,14 @@ xparkeyval xdiffev xjobqueue xdafreader +xrafreader xmisc xstrint legosim numcores legofit tabpat +sitepat tabpat.log daf try diff --git a/src/Makefile b/src/Makefile index 6a3189c7..c7da8467 100644 --- a/src/Makefile +++ b/src/Makefile @@ -7,7 +7,7 @@ opt := -DNDEBUG -O3 -finline-functions # For full optimization prof := incl := -I/usr/local/include -I/opt/local/include -targets := legosim legofit tabpat daf numcores +targets := legosim legofit tabpat sitepat daf raf numcores pytargets := diverg.py bootci.py flatfile.py tests := xzeroin xbinary @@ -43,28 +43,40 @@ test : $(tests) -./xzeroin @echo "ALL UNIT TESTS WERE COMPLETED." +version.h: + ./mkversion.py > version.h + LEGOSIM := legosim.o patprob.o gptree.o binary.o jobqueue.o misc.o parse.o \ branchtab.o popnodetab.o lblndx.o tokenizer.o parstore.o parkeyval.o \ - popnode.o gene.o dprintf.o rngseed.o dtnorm.o + popnode.o gene.o dprintf.o rngseed.o dtnorm.o tinyexpr.o legosim : $(LEGOSIM) $(CC) $(CFLAGS) -o $@ $(LEGOSIM) $(lib) LEGOFIT := legofit.o patprob.o gptree.o binary.o jobqueue.o misc.o \ parse.o branchtab.o popnodetab.o lblndx.o tokenizer.o parstore.o \ parkeyval.o popnode.o gene.o cost.o diffev.o dprintf.o rngseed.o \ - simsched.o dtnorm.o + simsched.o dtnorm.o tinyexpr.o legofit : $(LEGOFIT) $(CC) $(CFLAGS) -o $@ $(LEGOFIT) $(lib) TABPAT := tabpat.o misc.o binary.o lblndx.o parkeyval.o dafreader.o \ - tokenizer.o strint.o boot.o + tokenizer.o strint.o boot.o error.o tabpat : $(TABPAT) $(CC) $(CFLAGS) -o $@ $(TABPAT) $(lib) +SITEPAT := sitepat.o misc.o binary.o lblndx.o parkeyval.o rafreader.o \ + tokenizer.o strint.o boot.o error.o +sitepat : $(SITEPAT) + $(CC) $(CFLAGS) -o $@ $(SITEPAT) $(lib) + DAF := daf.o misc.o daf : $(DAF) $(CC) $(CFLAGS) -o $@ $(DAF) $(lib) +RAF := raf.o misc.o +raf : $(RAF) + $(CC) $(CFLAGS) -o $@ $(RAF) $(lib) + NUMCORES := numcores.o misc.o numcores : $(NUMCORES) $(CC) $(CFLAGS) -o $@ $(NUMCORES) $(lib) diff --git a/src/README.md b/src/README.md index 8d9720c6..da334615 100644 --- a/src/README.md +++ b/src/README.md @@ -6,11 +6,16 @@ Legofit is a computer package that uses counts of nucleotide site patterns to estimate the history of population size, subdivision, and gene flow. The package consists of the following programs -* @ref daf "daf", which writes genetic data into the ".daf" format - that is used by @ref tabpat "tabpat". +* @ref daf "daf", which writes genetic data into the ".daf" format, + which is used by @ref tabpat "tabpat". * @ref tabpat "tabpat", which reads ".daf" files for several populations, tabulates "nucleotide site patterns" (explained below), and generates moving-blocks bootstrap replicates. +* @ref raf "raf", which writes genetic data into the ".raf" format, + which is used by @ref sitepat "sitepat". +* @ref sitepat "sitepat", which reads ".raf" files for several + populations, tabulates "nucleotide site patterns" (explained below), + and generates moving-blocks bootstrap replicates. * @ref legosim "legosim", which predicts site pattern counts from assumptions about population history. * @ref legofit "legofit", which estimates parameters from site pattern @@ -146,12 +151,38 @@ to test the source file `boot.c`. To run all unit tests, type # Genetic input data -Before doing data analysis with `legofit`, you must generate data files -in "daf" format. Such files end with ".daf", which stands for "derived -allele frequency. See the @ref daf "daf" command for instructions on -translating from "vcf" or "bcf" format into "daf". - -The "daf" file is very simple and looks like this: +Before doing data analysis with `legofit`, you must generate data +files in one of two formats: "daf" (for derived allele frequency), or +"raf" (for reference allele frequency). The first of these +alternatives requires input data in which the ancestral allele has +been previously called. The second ("raf") does not. Instead, the +program @ref sitepat "sitepat" uses an outgroup sequence to call +ancestral alleles on the fly. + +## daf format + +This was the only input data format in early versions of the legofit +package. We began moving away from it when we realized that, under +certain circumstances, it can introduce bias. To understand why, +suppose we have data from 4 populations, A, B, C, and D. Suppose +further that D has received gene flow from an unobserved population, +which is distantly related to the 4 observed populations. This gene +flow will introduce derived alleles into D, but at these sites A, B, +and C will be fixed for the ancestral allele. If ancestral allele +calls are based only on A, B, and C, then no ancestral allele calls +will be available for sites at which derived alleles have been +introduced into D. The gene flow into D will therefore be +undetectable. To avoid such problems, it is best to call ancestral +alleles during the process of tabulating site patterns, using the +programs @ref raf "raf" and @ref sitepat "sitepat". Nonetheless, we +retain @ref daf "daf" and @ref tabpat "tabpat" for backwards +compatibility. + +The suffix "daf" stands for "derived allele frequency. See the @ref +daf "daf" command for instructions on translating from "vcf" or "bcf" +format into "daf". + +The "daf" file is simple and looks like this: #chr pos aa da daf 1 752566 g a 0.835294117647058854 @@ -170,13 +201,46 @@ used here to label the columns. The columns are as follows: across files in a given analysis. 4. Ancestral allele. 5. Derived allele. Indels and loci with 3 or more alleles should be - excluded. + excluded. 6. Frequency of the derived allele within the sample. The lines should be sorted lexically by chromosome. Within chromosomes, they should be sorted in ascending numerical order of column 2. There should be no duplicate (chromosome, position) pairs. +## raf format + +The suffix "raf" stands for "reference allele frequency". See the @ref +raf "raf" command for instructions on translating from "vcf" or "bcf" +format into "raf". + +The "raf" file is simple and looks like this: + + #chr pos ref alt raf + 1 752566 g . 0.835294117647058854 + 1 754192 a g 0.858823529411764652 + 1 755225 t a 0.000000000000000000 + 1 755228 t . 0.000000000000000000 + 1 765437 g . 0.000000000000000000 + +The columns are separated by tabs. The first line (beginning with +"#") is an optional comment, which is used here to label the +columns. The columns are as follows: + +1. Character strings that label chromosomes or scaffolds. +2. Position of the site on the chromosome or scaffold, measured in base + pairs. Daf format doesn't care whether nucleotide positions are + numbered beginning with 0 or with 1, provided that they are consistent + across files in a given analysis. +4. Reference allele. +5. Alternate allele. Indels and loci with 3 or more alleles should be + excluded. +6. Frequency of the reference allele within the sample. + +The lines should be sorted lexically by chromosome. Within +chromosomes, they should be sorted in ascending numerical order of +column 2. There should be no duplicate (chromosome, position) pairs. + # Describing population history in an lgo file {#lgo} The ".lgo" format describes the history of population size, @@ -198,10 +262,13 @@ whose names are "zero" and "one" The first is a "time" variable, which I will use for the tips of branches, where time equals 0. I declare it "fixed", which means that -it will not change. The second is a "twoN" variable, which represents -twice the size of a population. When there is only one sample per -population, the sizes of tip populations do not matter, so I set -them all equal to "one". Next, three more time variables named "Txyn", +it will not change. Variable names must begin with a +letter. Subsequent characters may be letters, digits, or underscores. +The second variable declaration begins with "twoN", which indicates +that this variable refers to haploid population size--twice twice the +size of the diploid population. When there is only one sample per +population, the sizes of tip populations don't matter, so I set them +all equal to "one". Next, three more time variables named "Txyn", "Tn", and "Txy". time fixed Tn=1897 # time of Neanderthal admixture @@ -233,11 +300,11 @@ variable "x" just defined will be ignored in what follows. Our measure of population size is twice the effective size of the population, and we define two such variables: - twoN free 2Nn=1e3 # archaic population size - twoN constrained 2Nxy=1e4 + -1.2*Txy # early modern population size + twoN free twoNn=1e3 # archaic population size + twoN constrained twoNxy=1e4 - 1.2*Txy # early modern population size The first of these is a free parameter, but the second is a new -category: "constrained". It defines "2Nxy" as a function of "Txy". +category: "constrained". It defines "twoNxy" as a function of "Txy". Constraints are useful when analysis of bootstrap samples indicates a tight relationship between two or more free parameters. Constraints reduce the number of free parameters and allow more accurate @@ -245,24 +312,24 @@ estimates. In the constraint above, there are only two terms and one independent variable---"Txy". It is legal, however, to use any number of terms and independent variables. For example, we could have written - twoN constrained 2Nxy=1e4 + -1.2*Txy + 0.01*Txy*Txyn # OK + twoN constrained twoNxy=1e4 - 1.2*Txy + 0.01*Txy*Txyn # OK All independent variables must be free parameters, and all must be -defined before the constraint. The terms in the constraint must be -separated by "+". It would *not* be legal to rewrite the constraint -as +defined before the constraint. The parser can recognize complex +mathematical expressions and knows about the standard mathematical +functions. The y'th power of x can be written either as "x^y" or as +"pow(x,y)". The natural log can be written either as "log" or as +"ln". Parentheses are allowed, and operators have the usual +precedence. For example, the following lines are equivalent: - twoN constrained 2Nxy=10000 - 1.2*Txy # ERROR - -Because the "+" sign is used to separate terms, it is not legal to -write a numerical constant as "1e+4". On the other hand, "1e-4" is -fine. + twoN constrained x=exp(a)*pow(b,y) + twoN constrained x=e^a*b^y + twoN constrained x=exp(a + y*log(b)) To spread a constraint across several lines, break the line after a -"+" symbol. For example, the correct version of the statement above -could be written as +"+" symbol. For example, - twoN constrained 2Nxy=1e4 + + twoN constrained twoNxy=1e4 + -1.2*Txy + 0.01*Txy*Txyn # Constraint spread across 3 lines. @@ -294,7 +361,7 @@ The next two lines are similar, and define two other terminal populations: segment y t=zero twoN=one samples=1 # Eurasia - segment n t=Tn twoN=2Nn samples=1 # Neanderthal + segment n t=Tn twoN=twoNn samples=1 # Neanderthal Segment "n" does not end at time zero, but rather at the time, Tn, of Neanderthal admixture. It has one sample, whose date is also Tn. This @@ -304,17 +371,17 @@ assumption for simplicity---this is only an example. There are 3 more segments to declare: segment y2 t=Tn twoN=one # pre-mig eurasia - segment xy t=Txy twoN=2Nxy # early modern - segment xyn t=Txyn twoN=2Nn # ancestral + segment xy t=Txy twoN=twoNxy # early modern + segment xyn t=Txyn twoN=twoNn # ancestral These segments don't have a "samples" component, because none of them have genetic samples. Segment y2 represents the Eurasian population before the episode of admixture. Note that it ends at the same time as segment n. This is necessary, because we will want to mix y2 and n below to model gene flow. Also note that the size of xyn equals -2Nn---the same variable we used in setting the size of segment n. This +twoNn---the same variable we used in setting the size of segment n. This establishes a constraint: the sizes of XYN and N will always be equal, -no matter how the optimizer adjusts the value of 2Nn. +no matter how the optimizer adjusts the value of twoNn. The rest of the .lgo file defines relationships between segments. This involves two statements: "mix" and "derive". Consider the mix @@ -341,15 +408,22 @@ Segments cannot have more than two parents or more than two children. All segments should descend, eventually, from a single root. +The site patterns printed refer only to the segments that contain +samples, and the sort order of site patterns is determined by the +order in which segments are listed in the .lgo file. In the file +discussed above, there are three segments with samples, and these are +in order "x", "y", "n". For this reason, the output will contain a +site pattern labeled "x:y:n" rather than, say, "x:n:y". + Using this .lgo file as input, `legosim -i 10000` produces ############################################################ # legosim: generate site patterns by coalescent simulation # ############################################################ - + # Program was compiled: Jun 8 2017 12:41:14 # Program was run: Tue Jul 11 09:18:00 2017 - + # cmd: legosim -i 10000 input.lgo # nreps : 10000 # input file : input.lgo @@ -382,13 +456,14 @@ See the @ref legosim "legosim" documentation for details. ## Estimating parameters from genetic data This involves several programs. The first step is to generate input -files in ".daf" format, as described above. You will need one .daf -file for each population. See the @ref daf "daf" documentation for -details. - -The next task is to tabulate site pattern counts from the various .daf -files. See the @ref tabpat "tabpat" documentation for details. Tabpat -will generate a small text file, with one row for each site +files in ".daf" or ".raf" format, as described above. You will need +one .daf or .raf file for each population. For .raf, you will in +addition need a .raf file for an outgroup population. See the +documentation of @ref daf "daf" and @ref raf "raf" for details. + +The next task is to tabulate site pattern counts. For details, see +@ref tabpat "tabpat" or @ref sitepat "sitepat". Either of these +programs will generate a small text file, with one row for each site pattern. If there are 4 populations in the analysis, there will be 10 site patterns. @@ -402,11 +477,11 @@ and the number of simulation replicates used to approximate the objective function. To generate a bootstrap confidence interval, use the `--bootreps` -option of @ref tabpat "tabpat". This will generate not only the -primary output (written to standard output), but also an additional -output file for each bootstrap replicate. For example, `--bootreps 50` -would generate 51 output files: 1 for the normal output, and 1 for -each bootstrap replicate. +option of @ref tabpat "tabpat" or @ref sitepat "sitepat". This will +generate not only the primary output (written to standard output), but +also an additional output file for each bootstrap replicate. For +example, `--bootreps 50` would generate 51 output files: 1 for the +normal output, and 1 for each bootstrap replicate. These files should each be analyzed with a separate run of `legofit`. If you have access to a compute cluster, these jobs can be @@ -432,8 +507,22 @@ the contribution of each site pattern. Thus, it will tell you which site patterns are responsible for a poor fit between observed and expected site pattern frequencies. +## Sort order of site patterns + +The sort order of site patterns is determined by the order of command +line arguments to @ref tabpat "tabpat" and @ref sitepat "sitepat", and +by the order in which segments are defined in the .lgo file. For +example, if "x" precedes "y", then we get a site pattern labeled "xy" +rather than "yx". These inputs should be ordered consistently in +tabpat, sitepat, and .lgo, for otherwise it will be hard to compare +observed site pattern frequencies with those predicted by @ref legosim +"legosim" or @ref legofit "legofit". I recommend arranging them in the +order that populations labels appear in your figures. + # Bias in `legofit` +## Biases arising from constraints + Biases arise in `legofit` because of the constraint that a child population cannot be older than its parent. If the parent's age is fixed, then it represents an inequality constraint on the age of the @@ -456,3 +545,19 @@ This bias is exacerbated if the parent's age is modeled as a Gaussian variable, because then the constraint is an interval rather than a point, and the optimizer encounters it sooner. For this reason, I have not used Gaussian variables in recent work. + +## Bias arising from selection of variant sites + +Genetic data are often restricted to sites that are polymorphic within +some group of samples. For example, the 1000-Genomes Project +distributes files containing genotypes that are polymorphic within +modern humans. These data do not include sites at which the derived +allele is present only in Neanderthals and/or Denisovans. An analysis +using such data would be biased, because it would undercount the +corresponding site patterns. + +To avoid this bias, it is best to work with complete genomes rather +than just the variant sites. Alternatively, one could call variant +sites jointly using all genomes to be included in the analysis. + + diff --git a/src/branchtab.c b/src/branchtab.c index ab57afe1..2d0f8116 100644 --- a/src/branchtab.c +++ b/src/branchtab.c @@ -760,19 +760,19 @@ const char *tstInput = "time free Tc=1\n" "time free Tab=3\n" "time free Tabc=5.5\n" - "twoN free 2Na=100\n" - "twoN fixed 2Nb=123\n" - "twoN free 2Nc=213.4\n" - "twoN fixed 2Nbb=32.1\n" - "twoN free 2Nab=222\n" - "twoN fixed 2Nabc=1.2e2\n" + "twoN free twoNa=100\n" + "twoN fixed twoNb=123\n" + "twoN free twoNc=213.4\n" + "twoN fixed twoNbb=32.1\n" + "twoN free twoNab=222\n" + "twoN fixed twoNabc=1.2e2\n" "mixFrac free Mc=0.02\n" - "segment a t=T0 twoN=2Na samples=1\n" - "segment b t=T0 twoN=2Nb samples=1\n" - "segment c t=Tc twoN=2Nc samples=1\n" - "segment bb t=Tc twoN=2Nbb\n" - "segment ab t=Tab twoN=2Nab\n" - "segment abc t=Tabc twoN=2Nabc\n" + "segment a t=T0 twoN=twoNa samples=1\n" + "segment b t=T0 twoN=twoNb samples=1\n" + "segment c t=Tc twoN=twoNc samples=1\n" + "segment bb t=Tc twoN=twoNbb\n" + "segment ab t=Tab twoN=twoNab\n" + "segment abc t=Tabc twoN=twoNabc\n" "mix b from bb + Mc * c\n" "derive a from ab\n" "derive bb from ab\n" diff --git a/src/cost.c b/src/cost.c index edffaaf0..a2bf6f4b 100644 --- a/src/cost.c +++ b/src/cost.c @@ -40,7 +40,8 @@ double costFun(int dim, double x[dim], void *jdata, void *tdata) { long nreps = SimSched_getSimReps(cp->simSched); DPRINTF(("%s:%d: nreps=%ld\n",__FILE__,__LINE__,nreps)); - GPTree_setParams(cp->gptree, dim, x); + if(GPTree_setParams(cp->gptree, dim, x)) + return HUGE_VAL; if(!GPTree_feasible(cp->gptree, 0)) return HUGE_VAL; diff --git a/src/daf.c b/src/daf.c index 0e306afc..08ed67de 100644 --- a/src/daf.c +++ b/src/daf.c @@ -71,10 +71,10 @@ int main(int argc, char **argv) { // Keep track of the number of sites at which the number // of reference, alternate, or ancestral alleles differs from 0 - int zeroref = 0, zeroalt = 0, zeroaa = 0, zerogtype = 0; - int missref = 0, missaa = 0; - int multref = 0, multalt = 0, multaa = 0; - int nbad = 0, ngood = 0; + long int zeroref = 0, zeroalt = 0, zeroaa = 0, zerogtype = 0; + long int missref = 0, missaa = 0; + long int multref = 0, multalt = 0, multaa = 0; + long int nbad = 0, ngood = 0; int ok; // is current line acceptable long unsigned lastnucpos = 0, nucpos; char lastchr[100] = { '\0' }; @@ -115,7 +115,6 @@ int main(int argc, char **argv) { (void) stripchr(reftoken, ' '); (void) stripchr(alttoken, ' '); (void) stripchr(aatoken, ' '); - (void) stripchr(aatoken, ' '); // Check sort of chromosomes if(*lastchr) { @@ -148,7 +147,7 @@ int main(int argc, char **argv) { assert(lastnucpos == 0); } - // Check sort of nucleotice positions + // Check sort of nucleotide positions if(lastnucpos) { if(lastnucpos == nucpos) { fprintf(stderr, "%s:%d: Duplicate: chr=%s pos=%lu\n", @@ -309,32 +308,31 @@ int main(int argc, char **argv) { printf("%4s %10s %2s %2s %20.18f\n", chr, pos, aa[0], alleles[1 - aai], p); } - - fprintf(stderr, "daf: %d good sites; %d rejected\n", ngood, nbad); + fprintf(stderr, "daf: %ld good sites; %ld rejected\n", ngood, nbad); if(zeroref) - fprintf(stderr, "daf: bad sites with 0 ref alleles: %d\n", zeroref); + fprintf(stderr, "daf: bad sites with 0 ref alleles: %ld\n", zeroref); if(zeroalt) - fprintf(stderr, "daf: bad sites with 0 alt alleles: %d\n", zeroalt); + fprintf(stderr, "daf: bad sites with 0 alt alleles: %ld\n", zeroalt); if(zeroaa) - fprintf(stderr, "daf: bad sites with 0 ancestral alleles: %d\n", + fprintf(stderr, "daf: bad sites with 0 ancestral alleles: %ld\n", zeroaa); if(zerogtype) - fprintf(stderr, "daf: bad sites with 0 genotypes: %d\n", zerogtype); + fprintf(stderr, "daf: bad sites with 0 genotypes: %ld\n", zerogtype); if(multref) - fprintf(stderr, "daf: bad sites with multiple ref alleles: %d\n", + fprintf(stderr, "daf: bad sites with multiple ref alleles: %ld\n", multref); if(multalt) - fprintf(stderr, "daf: bad sites with multiple alt alleles: %d\n", + fprintf(stderr, "daf: bad sites with multiple alt alleles: %ld\n", multalt); if(multaa) fprintf(stderr, - "daf: bad sites with multiple ancestral alleles: %d\n", + "daf: bad sites with multiple ancestral alleles: %ld\n", multaa); if(missref) - fprintf(stderr, "daf: bad sites with missing ref alleles: %d\n", + fprintf(stderr, "daf: bad sites with missing ref alleles: %ld\n", missref); if(missaa) - fprintf(stderr, "daf: bad sites with missing ancestral alleles: %d\n", + fprintf(stderr, "daf: bad sites with missing ancestral alleles: %ld\n", missaa); return 0; } diff --git a/src/dafreader.c b/src/dafreader.c index b540a030..d3e54c60 100644 --- a/src/dafreader.c +++ b/src/dafreader.c @@ -2,13 +2,14 @@ @file dafreader.c @brief Class DAFReader: read a daf file. - @copyright Copyright (c) 2016, Alan R. Rogers + @copyright Copyright (c) 2016, Alan R. Rogers . This file is released under the Internet Systems Consortium License, which can be found in file "LICENSE". */ #include "dafreader.h" #include "tokenizer.h" #include "misc.h" +#include "error.h" #include #include #include @@ -63,9 +64,8 @@ int iscomment(const char *s) { return rval; } -/// Read the next snp. -/// @return 0 on success; EOF on end of file; aborts with message if -/// other errors occur. +/// Read the next site. +/// @return 0 on success; EOF on end of file; abort on other errors. int DAFReader_next(DAFReader * self) { int ntokens1; int ntokens; @@ -80,7 +80,7 @@ int DAFReader_next(DAFReader * self) { if(NULL == strchr(buff, '\n') && !feof(self->fp)) { fprintf(stderr, "%s:%d: Buffer overflow. size=%zu\n", __FILE__, __LINE__, sizeof(buff)); - exit(EXIT_FAILURE); + return BUFFER_OVERFLOW; } if(iscomment(buff)) continue; @@ -97,7 +97,7 @@ int DAFReader_next(DAFReader * self) { fprintf(stderr, "buff: %s\n", buff); Tokenizer_printSummary(self->tkz, stderr); Tokenizer_print(self->tkz, stderr); - exit(EXIT_FAILURE); + return BAD_DAF_INPUT; } ++self->snpid; @@ -111,7 +111,7 @@ int DAFReader_next(DAFReader * self) { if(status >= sizeof self->chr) { fprintf(stderr, "%s:%d: chromosome name too long: %s\n", __FILE__, __LINE__, Tokenizer_token(self->tkz, 0)); - exit(EXIT_FAILURE); + return BUFFER_OVERFLOW; } int diff = strcmp(prev, self->chr); if(diff > 0) { @@ -121,7 +121,7 @@ int DAFReader_next(DAFReader * self) { prev, self->chr); Tokenizer_printSummary(self->tkz, stderr); Tokenizer_print(self->tkz, stderr); - exit(EXIT_FAILURE); + return BAD_SORT; } else if(diff < 0) { // new chromosome prevnucpos = 0UL; @@ -133,12 +133,12 @@ int DAFReader_next(DAFReader * self) { if(prevnucpos == self->nucpos) { fprintf(stderr, "%s:%d: Duplicate line in daf file. chr=%s pos=%lu\n", __FILE__, __LINE__, self->chr, self->nucpos); - exit(1); + return BAD_SORT; } else if(prevnucpos > self->nucpos) { fprintf(stderr, "%s:%d: positions missorted chr=%s " "prev=%lu curr=%lu\n", __FILE__, __LINE__, self->chr, prevnucpos, self->nucpos); - exit(1); + return BAD_SORT; } // Ancestral allele status = snprintf(self->aa, sizeof(self->aa), "%s", @@ -150,12 +150,24 @@ int DAFReader_next(DAFReader * self) { strlowercase(self->da); // Derived allele frequency - self->p = strtod(Tokenizer_token(self->tkz, 4), NULL); + char *token, *end; + token = Tokenizer_token(self->tkz, 4); + errno=0; + self->p = strtod(token, &end); + if(end==token) + errno = EINVAL; + if(errno) { + char err_buff[50]; + strerror_r(errno, err_buff, sizeof(err_buff)); + fprintf(stderr,"%s:%d: Bad float \"%s\" (%s); chr=%s pos=%lu\n", + __FILE__,__LINE__, token, err_buff, self->chr, self->nucpos); + return errno; + } return 0; } -/// Rewind daf file. +/// Rewind daf file. /// @return 0 on success; -1 on failure int DAFReader_rewind(DAFReader * self) { return fseek(self->fp, 0L, SEEK_SET); @@ -178,13 +190,14 @@ int DAFReader_multiNext(int n, DAFReader * r[n]) { // chromosome values in lexical sort order, and // set boolean flag, onSameChr, which indicates // whether all readers are on same chromosome. - if(EOF == DAFReader_next(r[0])) - return EOF; + if( (status = DAFReader_next(r[0])) ) + return status; + imaxchr = 0; onSameChr = 1; for(i = 1; i < n; ++i) { - if(EOF == DAFReader_next(r[i])) - return EOF; + if( (status = DAFReader_next(r[i])) ) + return status; diff = strcmp(r[i]->chr, r[imaxchr]->chr); if(diff > 0) { @@ -203,8 +216,8 @@ int DAFReader_multiNext(int n, DAFReader * r[n]) { if(i == imaxchr) continue; while((diff = strcmp(r[i]->chr, r[imaxchr]->chr)) < 0) { - if(EOF == DAFReader_next(r[i])) - return EOF; + if( (status = DAFReader_next(r[i])) ) + return status; } assert(diff >= 0); if(diff > 0) { @@ -225,7 +238,7 @@ int DAFReader_multiNext(int n, DAFReader * r[n]) { status = snprintf(currchr, sizeof currchr, "%s", r[0]->chr); if(status >= sizeof currchr) { fprintf(stderr, "%s:%d: buffer overflow\n", __FILE__, __LINE__); - exit(EXIT_FAILURE); + return BUFFER_OVERFLOW; } // Now get them all on the same position. Have to keep // checking chr in case one file moves to another chromosome. @@ -233,8 +246,8 @@ int DAFReader_multiNext(int n, DAFReader * r[n]) { // Increment each reader so long as we're all on the same // chromosome and the reader's nucpos is low. while(onSameChr && r[i]->nucpos < maxnuc) { - if(EOF == DAFReader_next(r[i])) - return EOF; + if( (status = DAFReader_next(r[i])) ) + return status; diff = strcmp(r[i]->chr, currchr); if(diff != 0) { // Assertion should succeed because DAFReader_next @@ -248,6 +261,21 @@ int DAFReader_multiNext(int n, DAFReader * r[n]) { } while(!onSameChr || minnuc != maxnuc); + // Make sure reference allele isn't fixed in readers. If it's + // fixed, then we can't call the ancestral allele. + double maxp, minp; + maxp = minp = DAFReader_daf(r[0]); + for(i=1; i < n; ++i) { + double p = DAFReader_daf(r[i]); // derived allele freq + minp = fmin(minp, p); + maxp = fmax(maxp, p); + } + if(maxp == 0.0 || minp == 1.0) + return NO_ANCESTRAL_ALLELE; + + if(!DAFReader_allelesMatch(n, r)) + return ALLELE_MISMATCH; + return 0; } diff --git a/src/error.c b/src/error.c new file mode 100644 index 00000000..45f392a1 --- /dev/null +++ b/src/error.c @@ -0,0 +1,43 @@ +#include "error.h" +#include +#include + +int mystrerror_r(int errnum, char *buff, size_t len) { + int status=0, rval=0; + + switch(errnum) { + case NO_ANCESTRAL_ALLELE: + rval = snprintf(buff, len, "No ancestral allele"); + break; + case REF_MISMATCH: + rval = snprintf(buff, len, "Inconsistent REF alleles"); + break; + case ALLELE_MISMATCH: + rval = snprintf(buff, len, "Inconsistent alleles"); + break; + case MULTIPLE_ALT: + rval = snprintf(buff, len, "Multiple ALT alleles"); + break; + case BUFFER_OVERFLOW: + rval = snprintf(buff, len, "Buffer overflow"); + break; + case BAD_RAF_INPUT: + rval = snprintf(buff, len, "Bad .raf input file"); + break; + case BAD_DAF_INPUT: + rval = snprintf(buff, len, "Bad .raf input file"); + break; + case BAD_SORT: + rval = snprintf(buff, len, "Incorrect sort"); + break; + default: + status = strerror_r(errnum, buff, len); + } + if(rval>=len || status!=0) { + fprintf(stderr,"%s:%s:%d: buffer overflow\n", + __FILE__,__func__,__LINE__); + exit(EXIT_FAILURE); + } + + return 0; +} diff --git a/src/error.h b/src/error.h new file mode 100644 index 00000000..46ef822f --- /dev/null +++ b/src/error.h @@ -0,0 +1,18 @@ +#ifndef ERROR_INCLUDED +#define ERROR_INCLUDED + +#include + +// Number from 1000 to avoid conflict with errno +enum {NO_ANCESTRAL_ALLELE=1000, + REF_MISMATCH, + ALLELE_MISMATCH, + MULTIPLE_ALT, + BUFFER_OVERFLOW, + BAD_RAF_INPUT, + BAD_DAF_INPUT, + BAD_SORT}; + +int mystrerror_r(int errnum, char *buff, size_t len); + +#endif diff --git a/src/exception.c b/src/exception.c new file mode 100644 index 00000000..ed09f696 --- /dev/null +++ b/src/exception.c @@ -0,0 +1,43 @@ +#include "exception.h" +#include +#include + +Exception *Exception_new(void) { + Exception *self = malloc(sizeof(Exception)); + if(self==NULL) { + fprintf(stderr, "%s:%s:%d: bad malloc\n", + __FILE__,__func__,__LINE__); + exit(EXIT_FAILURE); + } + self->next = NULL; + self->buff[0] = '\0'; + return self; +} + +Exception *Exception_push(Exception *root, Exception *new) { + Exception *e; + if(root == NULL) + return new; + for(e=root; e; e=e->next) + ; + e->next = new; + return root; +} + +void Exception_abort(Exception *self) { + Exception *e; + fputs("Exception:\n", stderr); + for(e=self; e; e=e->next) { + fputs(" ", stderr); + fputs(e->buff, stderr); + } + Exception_free(self); + exit(EXIT_FAILURE); +} + +void Exception_free(Exception *self) { + if(self==NULL) + return; + Exception_free(self->next); + free(self); +} diff --git a/src/exception.h b/src/exception.h new file mode 100644 index 00000000..247a8fd2 --- /dev/null +++ b/src/exception.h @@ -0,0 +1,16 @@ +#ifndef EXCEPTION_INCLUDED +#define EXCEPTION_INCLUDED +#define EXCEPTION_SIZE 1000 +#include "typedefs.h" + +struct Exception { + struct Exception *next; + char buff[EXCEPTION_SIZE]; +}; + +Exception *Exception_new(void); +Exception *Exception_push(Exception *root, Exception *new); +void Exception_abort(Exception *self); +void Exception_free(Exception *self); + +#endif diff --git a/src/formula.c b/src/formula.c new file mode 100644 index 00000000..396be5dd --- /dev/null +++ b/src/formula.c @@ -0,0 +1,127 @@ +#include +#include + +#define MAXTOKEN 100 + +typedef enum TokenType { + equals, number, operator, lparen, rparen, exponential, logarithm, + identifier +} TokenType; + +typedef enum OpType { + plus, minus, times, divide, power +} OpType; + +typedef struct Token { + int type; + OpType op; + double value; + double *param; // not locally owned + char *identifier; // not locally owned + char str[MAXTOKEN] +} Token; + +Token *Token_new(int type) { + Token *self = malloc(sizeof(Token)); + if(self==NULL) + return NULL; + memset(self, 0, sizeof Token); + return self; +} + +void Token_free(Token *self) { + free(self)/ +} + +Token *Token_next(int n, char *buff, char **after, ParStore *ps, + int *err) { + Token *self; + while(isspace(*buff)) + ++buff; + if(*buff == '\0') { + *err = EOF; + return NULL; + } + + // Is token a number? + errno = 0; + double x = strtod(buff, after); + if(errno) { + *err = errno; + return NULL; + } + if(*after != buff) { + token = Token_new(); + token->type = number; + token->value = x; + return token; + } + + // Is token "exp"? + if(0 == strncmp(buff, "exp", 3) && !isalnum(buff[3])) { + token = Token_new(); + token->type = exponential; + *after = buff+3; + return token; + } + + // Is token "log"? + if(0 == strncmp(buff, "log", 3) && !isalnum(buff[3])) { + token = Token_new(); + token->type = logarithm; + *after = buff+3; + return token; + } + + // Is token an identifier? + char *s = buff; + if(isalpha(*s++)) { + while(*s != '\0' && (isalnum(*s) || *s=='_')) + ++s; + *after = s; + size_t len = s - buf; + if(len > MAXTOKEN-1) { + *err = BUFFER_OVERFLOW; + return NULL; + } + token = Token_new(); + token->type = identifier; + strncpy(self->str, buff, len); + return token; + } + token = Token_new(); + switch(*buff) { + case '=': + token->type = equals; + return token; + case '+': + token->type = operator; + token->op = plus; + return token; + case '-': + token->type = operator; + token->op = minus; + return token; + case '*': + token->type = operator; + token->op = times; + return token; + case '/': + token->type = operator; + token->op = divide; + return token; + case '^': + token->type = operator; + token->op = exponentiate; + return token; + case '(': + token->type = lparen; + return token; + case ')': + token->type = lparen; + return token; + } + Token_free(token); + *err = BAD_CONSTRAINT; + return NULL; +} diff --git a/src/gene.c b/src/gene.c index 19719112..17403a1d 100644 --- a/src/gene.c +++ b/src/gene.c @@ -137,10 +137,6 @@ int main(void) { assert(BranchTab_size(bt) == 1); assert(2.0 == BranchTab_get(bt, (id1|id2))); - Gene_free(g1); - Gene_free(g2); - Gene_free(g3); - Gene_free(g4); Gene_free(g5); BranchTab_free(bt); diff --git a/src/gptree.c b/src/gptree.c index d4e0ebea..8c7cc9e2 100644 --- a/src/gptree.c +++ b/src/gptree.c @@ -41,7 +41,8 @@ struct GPTree { /// Print a description of parameters. void GPTree_printParStore(GPTree * self, FILE * fp) { - ParStore_constrain(self->parstore); + if(ParStore_constrain(self->parstore)) + fprintf(fp,"Warning: free parameters violate constraints\n"); ParStore_print(self->parstore, fp); } @@ -61,10 +62,10 @@ void GPTree_randomize(GPTree * self, gsl_rng * rng) { /// @param[in] n number of parameters in array, which should equal the /// number of free parameters in the GPTree. /// @param[in] x array of parameter values. -void GPTree_setParams(GPTree * self, int n, double x[n]) { +int GPTree_setParams(GPTree * self, int n, double x[n]) { assert(n == ParStore_nFree(self->parstore)); ParStore_setFreeParams(self->parstore, n, x); - ParStore_constrain(self->parstore); + return ParStore_constrain(self->parstore); } /// Copy free parameters from GPTree into an array @@ -93,7 +94,10 @@ int GPTree_nFree(const GPTree * self) { void GPTree_simulate(GPTree * self, BranchTab * branchtab, gsl_rng * rng, unsigned long nreps, int doSing) { unsigned long rep; - ParStore_constrain(self->parstore); + if(ParStore_constrain(self->parstore)) { + fprintf(stderr,"%s:%d: free parameters violate constraints\n", + __FILE__,__LINE__); + } for(rep = 0; rep < nreps; ++rep) { PopNode_clear(self->rootPop); // remove old samples SampNdx_populateTree(&(self->sndx)); // add new samples @@ -163,7 +167,10 @@ void GPTree_free(GPTree * self) { /// Duplicate a GPTree object GPTree *GPTree_dup(const GPTree * old) { assert(old); - ParStore_constrain(old->parstore); + if(ParStore_constrain(old->parstore)) { + fprintf(stderr,"%s:%d: free parameters violate constraints\n", + __FILE__,__LINE__); + } assert(GPTree_feasible(old, 0)); if(old->rootGene != NULL) { fprintf(stderr, "%s:%s:%d: old->rootGene must be NULL on entry\n", @@ -234,7 +241,10 @@ GPTree *GPTree_dup(const GPTree * old) { pthread_mutex_unlock(&outputLock); exit(EXIT_FAILURE); } - ParStore_constrain(new->parstore); + if(ParStore_constrain(new->parstore)) { + fprintf(stderr,"%s:%d: free parameters violate constraints\n", + __FILE__,__LINE__); + } assert(GPTree_feasible(new, 1)); return new; } @@ -266,8 +276,10 @@ int GPTree_equals(const GPTree * lhs, const GPTree * rhs) { __FILE__, __func__, __LINE__); if(!Bounds_equals(&lhs->bnd, &rhs->bnd)) return 0; - if(!ParStore_equals(lhs->parstore, rhs->parstore)) + if(!ParStore_equals(lhs->parstore, rhs->parstore)) { + fprintf(stderr,"%s:%d: !ParStore_equals\n",__FILE__,__LINE__); return 0; + } if(!LblNdx_equals(&lhs->lblndx, &rhs->lblndx)) return 0; if(!SampNdx_equals(&lhs->sndx, &rhs->sndx)) @@ -297,7 +309,8 @@ unsigned GPTree_nsamples(GPTree * self) { /// Are parameters within the feasible region? int GPTree_feasible(const GPTree * self, int verbose) { - ParStore_constrain(self->parstore); + if(ParStore_constrain(self->parstore)) + return 0; return PopNode_feasible(self->rootPop, self->bnd, verbose); } @@ -330,19 +343,19 @@ const char *tstInput = "time free Tc=1\n" "time free Tab=3\n" "time free Tabc=5.5\n" - "twoN free 2Na=100\n" - "twoN fixed 2Nb=123\n" - "twoN free 2Nc=213.4\n" - "twoN fixed 2Nbb=32.1\n" - "twoN free 2Nab=222\n" - "twoN fixed 2Nabc=1.2e2\n" + "twoN free twoNa=100\n" + "twoN fixed twoNb=123\n" + "twoN free twoNc=213.4\n" + "twoN fixed twoNbb=32.1\n" + "twoN free twoNab=222\n" + "twoN fixed twoNabc=1.2e2\n" "mixFrac free Mc=0.02\n" - "segment a t=T0 twoN=2Na samples=1\n" - "segment b t=T0 twoN=2Nb samples=1\n" - "segment c t=Tc twoN=2Nc samples=1\n" - "segment bb t=Tc twoN=2Nbb\n" - "segment ab t=Tab twoN=2Nab\n" - "segment abc t=Tabc twoN=2Nabc\n" + "segment a t=T0 twoN=twoNa samples=1\n" + "segment b t=T0 twoN=twoNb samples=1\n" + "segment c t=Tc twoN=twoNc samples=1\n" + "segment bb t=Tc twoN=twoNbb\n" + "segment ab t=Tab twoN=twoNab\n" + "segment abc t=Tabc twoN=twoNabc\n" "mix b from bb + Mc * c\n" "derive a from ab\n" "derive bb from ab\n" "derive ab from abc\n" "derive c from abc\n"; diff --git a/src/gptree.h b/src/gptree.h index fad90dda..1971c59a 100644 --- a/src/gptree.h +++ b/src/gptree.h @@ -18,7 +18,7 @@ int GPTree_nFree(const GPTree *self); double *GPTree_loBounds(GPTree *self); double *GPTree_upBounds(GPTree *self); unsigned GPTree_nsamples(GPTree *self); -void GPTree_setParams(GPTree *self, int n, double x[n]); +int GPTree_setParams(GPTree *self, int n, double x[n]); void GPTree_getParams(GPTree *self, int n, double x[n]); void GPTree_randomize(GPTree *self, gsl_rng *rng); void GPTree_printParStore(GPTree *self, FILE *fp); diff --git a/src/input.lgo b/src/input.lgo index 082c021a..ff75e72b 100644 --- a/src/input.lgo +++ b/src/input.lgo @@ -6,19 +6,20 @@ twoN fixed one=1 time free Txyn=25920 # archaic-modern separation time time fixed Tn=1897 # time of Neanderthal admixture time free Txy=3788 # Africa-Eurasia separation time -twoN free 2Nn=1e3 # archaic population size +twoN free twoNn=1e3 # archaic population size # For constrained variables, the terms of the formula are separated # by "+" signs, so negative terms must be entered as shown below. # To spread a constraint function across several lines, break # the line after a "+" symbol. -twoN constrained 2Nxy=1e4 + -1.2*Txy # early modern population size +#twoN constrained twoNxy=1e4 - 1.2*Txy # early modern population size +twoN constrained twoNxy=exp(4 - 1.2*log(Txy)) # early modern population size mixFrac free mN=0.02 # Neanderthal admixture into y segment x t=zero twoN=one samples=1 # Africa segment y t=zero twoN=one samples=1 # Eurasia -segment n t=Tn twoN=2Nn samples=1 # Neanderthal +segment n t=Tn twoN=twoNn samples=1 # Neanderthal segment y2 t=Tn twoN=one # pre-mig eurasia -segment xy t=Txy twoN=2Nxy # early modern -segment xyn t=Txyn twoN=2Nn # ancestral +segment xy t=Txy twoN=twoNxy # early modern +segment xyn t=Txyn twoN=twoNn # ancestral mix y from y2 + mN * n # y is a mixture of y2 and n derive x from xy # x is child of xy derive y2 from xy # y2 is child of xy diff --git a/src/legofit.c b/src/legofit.c index 8227ea58..1686d818 100644 --- a/src/legofit.c +++ b/src/legofit.c @@ -129,7 +129,7 @@ reported in the legofit output. To double this value, use "-T 2e-4" or @copyright Copyright (c) 2016, 2017, Alan R. Rogers . This file is released under the Internet Systems Consortium License, which can be found in file "LICENSE". - */ +*/ #include "branchtab.h" #include "cost.h" @@ -205,6 +205,7 @@ void usage(void) { tellopt("-p or --ptsPerDim ", "number of DE points per free var"); tellopt("-1 or --singletons", "Use singleton site patterns"); tellopt("-v or --verbose", "verbose output"); + tellopt("--version", "Print version and exit"); tellopt("-h or --help", "print this message"); exit(1); } @@ -248,13 +249,11 @@ int main(int argc, char **argv) { {"singletons", no_argument, 0, '1'}, {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, + {"version", no_argument, 0, 'V'}, {NULL, 0, NULL, 0} }; - printf("########################################\n" - "# legofit: estimate population history #\n" - "########################################\n"); - putchar('\n'); + hdr("legofit: estimate population history"); int i, j; time_t currtime = time(NULL); @@ -347,6 +346,8 @@ int main(int argc, char **argv) { case 'v': verbose = 1; break; + case 'V': + return 0; case 'T': ytol = strtod(optarg, 0); break; @@ -541,7 +542,11 @@ int main(int argc, char **argv) { putchar('\n'); // Get mean site pattern branch lengths - GPTree_setParams(gptree, dim, estimate); + if(GPTree_setParams(gptree, dim, estimate)) { + fprintf(stderr,"%s:%d: free params violate constraints\n", + __FILE__,__LINE__); + exit(1); + } BranchTab *bt = patprob(gptree, simreps, doSing, rng); BranchTab_divideBy(bt, (double) simreps); // BranchTab_print(bt, stdout); diff --git a/src/legosim.c b/src/legosim.c index fcd74ec8..f166eb43 100644 --- a/src/legosim.c +++ b/src/legosim.c @@ -114,6 +114,7 @@ void usage(void) { tellopt("-1 or --singletons", "Use singleton site patterns"); tellopt("-U ", "Mutations per generation per haploid genome."); tellopt("-h or --help", "print this message"); + tellopt("--version", "print version and exit"); exit(1); } @@ -125,13 +126,10 @@ int main(int argc, char **argv) { {"mutations", required_argument, 0, 'U'}, {"singletons", no_argument, 0, '1'}, {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, {NULL, 0, NULL, 0} }; - - printf("############################################################\n" - "# legosim: generate site patterns by coalescent simulation #\n" - "############################################################\n"); - putchar('\n'); + hdr("legosim: generate site patterns by coalescent simulation"); int i, j; int doSing=0; // nonzero => use singleton site patterns @@ -172,6 +170,8 @@ int main(int argc, char **argv) { case '1': doSing=1; break; + case 'V': + return 0; case 'h': default: usage(); diff --git a/src/misc.c b/src/misc.c index fc939c3e..deb2ff34 100644 --- a/src/misc.c +++ b/src/misc.c @@ -9,6 +9,7 @@ #include "misc.h" #include "binary.h" #include "lblndx.h" +#include "version.h" #include #include #include @@ -495,7 +496,7 @@ char *nextWhitesepToken(char **str) { /// in string delim. Set ptr[i] equal to the address of the i'th /// token. Terminate each substring with '\0' within s. Return the /// number of tokens. Abort if the number of substrings exceeds -/// n. +/// n. int tokenize(int dim, char *token[dim], char *s, const char *delim) { char *t; int n=0; @@ -511,7 +512,7 @@ int tokenize(int dim, char *token[dim], char *s, const char *delim) { return n; } -/// In string s, replace instances of character a with character b. +/// In string s, replace instances of character a with character b. void strReplaceChr(char *s, int a, int b) { while(*s != '\0') { if(*s == a) @@ -530,7 +531,7 @@ double parseDbl(char *token) { errno=0; x = strtod(token, &leftover); if(errno) { - // strdup detected a problem + // strtod detected a problem return 0.0; } if(leftover==token) { @@ -547,3 +548,66 @@ double parseDbl(char *token) { } return x; } + +/// Truncate string to n characters (plus terminating '\0') by +/// removing characters from the left. +char * strltrunc(char *s, int n) { + int w = strlen(s); + if(w <= n) + return s; + + char *u = s, *v = s + (w-n); + while(*v) + *u++ = *v++; + *u = '\0'; + return s; +} + +void hdr(const char *msg) { + int i, wid, len1, len2, status; + char version[100], buff[100]; + status = snprintf(version, sizeof(version), "version %s", VERSION); + if(status >= sizeof(version)) { + fprintf(stderr,"%s:%d: buffer overflow\n",__FILE__,__LINE__); + exit(EXIT_FAILURE); + } + len1 = strlen(msg); + len2 = strlen(version); + wid = 2 + (len1 > len2 ? len1 : len2); + for(i=0; i < 2 + wid; ++i) + putchar('#'); + putchar('\n'); + printf("#%s#\n", strcenter(msg, wid, buff, sizeof(buff))); + printf("#%s#\n", strcenter(version, wid, buff, sizeof(buff))); + for(i=0; i < 2 + wid; ++i) + putchar('#'); + putchar('\n'); + putchar('\n'); +} + +/** + * Center string "text" in a field of width "width". The centered string + * is written into the character string "buff", whose size is + * "buffsize". + */ +char *strcenter(const char *text, unsigned width, + char *buff, size_t buffsize) { + int i, j, lpad = 0, rpad = 0, txtwid; + + txtwid = strlen(text); + if(txtwid >= buffsize) { + snprintf(buff, buffsize, "%s", text); + return buff; + } + if(width > txtwid) + lpad = (width - txtwid) / 2; + rpad = width - txtwid - lpad; + for(i = 0; i < lpad; ++i) + buff[i] = ' '; + for(j = 0; j < txtwid; ++j) + buff[i + j] = text[j]; + for(j = 0; j < rpad; ++j) + buff[i + txtwid + j] = ' '; + buff[lpad + txtwid + rpad] = '\0'; + return buff; +} diff --git a/src/misc.h b/src/misc.h index f5798aa5..7e61d941 100644 --- a/src/misc.h +++ b/src/misc.h @@ -11,6 +11,7 @@ # include static inline int Dbl_near(double x, double y); +static inline int Dbl_equals_allowNonfinite(double x, double y); int compareLongs(const void *void_x, const void *void_y); int compareDoubles(const void *void_x, const void *void_y); int getNumCores(void); @@ -40,6 +41,10 @@ char *nextWhitesepToken(char **str); int tokenize(int dim, char *token[dim], char *s, const char *delim); void strReplaceChr(char *s, int a, int b); double parseDbl(char *token); +char *strltrunc(char *s, int n); +void hdr(const char *msg); +char *strcenter(const char *text, unsigned width, + char *buff, size_t buffsize); static inline double survival(double t, double twoN); @@ -92,6 +97,27 @@ static inline int Dbl_near(double x, double y) { return fabs(x - y) <= fmax(fabs(x), fabs(y)) * 8.0 * DBL_EPSILON; } +/// Return 1 if x==y, 0 otherwise. Unlike the standard comparison, +/// this one returns 1 if both values are nan, if both are positive infinity, +/// or if both are negative infinity. +static inline int Dbl_equals_allowNonfinite(double x, double y) { + if(x == y) + return 1; + + int xtype = fpclassify(x); + int ytype = fpclassify(y); + + if(xtype==FP_NAN && ytype==FP_NAN) + return 1; + if(xtype==FP_INFINITE && ytype==FP_INFINITE) { + if(x>0.0 && y>0.0) + return 1; + if(x<0.0 && y<0.0) + return 1; + } + return 0; +} + /// survival fuction static inline double survival(double t, double twoN) { assert(t >= 0.0); diff --git a/src/mkversion.py b/src/mkversion.py new file mode 100755 index 00000000..dbfbc80c --- /dev/null +++ b/src/mkversion.py @@ -0,0 +1,12 @@ +#!/usr/bin/python +import shlex, subprocess + +cmd = 'git log -n 1 --date=short --format=format:"rev.%ad.%h" HEAD' +args = shlex.split(cmd) +p = subprocess.Popen(args, stdout=subprocess.PIPE) +sha = p.stdout.readline() +p.terminate() + +print "#ifndef VERSION" +print '#define VERSION "%s"' % sha +print "#endif" diff --git a/src/numcores.c b/src/numcores.c index b2e6357b..9dee8ca0 100644 --- a/src/numcores.c +++ b/src/numcores.c @@ -1,6 +1,46 @@ #include "misc.h" #include -int main(void) { - printf("%d\n", getNumCores()); +#include +#include +#include +void usage(void); + +void usage(void) { + fprintf(stderr,"usage: numcores [factor]\n"); + fprintf(stderr," where \"factor\" is a number between 0 and 1.\n"); + fprintf(stderr," Program prints factor times the number of cores,\n"); + fprintf(stderr," rounded to nearest integer.\n"); + fprintf(stderr," It is an error if factor is outside of [0,1].\n"); + fprintf(stderr," Default: factor=1.\n"); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + double factor; + char *end; + switch(argc) { + case 1: + printf("%d\n", getNumCores()); + break; + case 2: + errno=0; + factor = strtod(argv[1], &end); + if(errno) { + fprintf(stderr,"Bad float: %s (%s)\n", + argv[1], strerror(errno)); + usage(); + }else if(end == argv[1]) { + fprintf(stderr,"Bad float: %s\n", + argv[1]); + usage(); + } + if(factor < 0.0 || factor > 1.0) + usage(); + printf("%0.0lf\n", floor(0.5 + factor * getNumCores())); + break; + default: + usage(); + break; + } return 0; } diff --git a/src/parkeyval.c b/src/parkeyval.c index 5db46dad..d91859c3 100644 --- a/src/parkeyval.c +++ b/src/parkeyval.c @@ -132,22 +132,29 @@ void ParKeyVal_sanityCheck(ParKeyVal *self, const char *file, int ParKeyVal_equals(ParKeyVal *lhs, ParKeyVal *rhs) { if(lhs == NULL && rhs == NULL) return 1; - if(lhs == NULL && rhs != NULL) - return 0; - if(lhs != NULL && rhs == NULL) - return 0; + if(lhs == NULL && rhs != NULL) { + return 0; + } + if(lhs != NULL && rhs == NULL) { + return 0; + } assert(lhs != NULL && rhs != NULL); - if(0 != strcmp(lhs->key, rhs->key)) + if(0 != strcmp(lhs->key, rhs->key)) { return 0; - if(lhs->valPtr == NULL && rhs->valPtr != NULL) + } + if(lhs->valPtr == NULL && rhs->valPtr != NULL) { return 0; - if(lhs->valPtr != NULL && rhs->valPtr == NULL) + } + if(lhs->valPtr != NULL && rhs->valPtr == NULL) { return 0; + } if(lhs->valPtr != NULL && rhs->valPtr != NULL) { - if(*lhs->valPtr != *rhs->valPtr) + if(!Dbl_equals_allowNonfinite(*lhs->valPtr, *rhs->valPtr)) { return 0; - if(lhs->pstat != rhs->pstat) + } + if(lhs->pstat != rhs->pstat) { return 0; + } } return ParKeyVal_equals(lhs->next, rhs->next); } diff --git a/src/parse.c b/src/parse.c index ebf12141..b6f64918 100644 --- a/src/parse.c +++ b/src/parse.c @@ -32,23 +32,23 @@ * * Here is input that would generate the tree above: * - * time fixed T0=0 - * time free Tc=1 - * time free Tab=3 - * time free Tabc=5.5 - * twoN free 2Na=100 - * twoN fixed 2Nb=123 - * twoN free 2Nc=213.4 - * twoN fixed 2Nbb=32.1 - * twoN free 2Nab=222 - * twoN fixed 2Nabc=1.2e2 + * time fixed T0=0 + * time free Tc=1 + * time free Tab=3 + * time free Tabc=5.5 + * twoN free twoNa=100 + * twoN fixed twoNb=123 + * twoN free twoNc=213.4 + * twoN fixed twoNbb=32.1 + * twoN free twoNab=222 + * twoN fixed twoNabc=1.2e2 * mixFrac free Mc=0.8 - * segment a t=T0 twoN=2Na samples=1 - * segment b t=T0 twoN=2Nb samples=2 - * segment c t=Tc twoN=2Nc samples=1 - * segment bb t=Tc twoN=2Nbb - * segment ab t=Tab twoN=2Nab - * segment abc t=Tabc twoN=2Nabc + * segment a t=T0 twoN=twoNa samples=1 + * segment b t=T0 twoN=twoNb samples=2 + * segment c t=Tc twoN=twoNc samples=1 + * segment bb t=Tc twoN=twoNbb + * segment ab t=Tab twoN=twoNab + * segment abc t=Tabc twoN=twoNabc * mix b from bb + Mc * c * derive a from ab * derive bb from ab @@ -171,17 +171,25 @@ void parseParam(char *next, enum ParamType type, char *name = strsep(&next, "="); CHECK_TOKEN(name, orig); name = stripWhiteSpace(name); - if( !isalnum(*name) ) { + int i, ok=1; + if( !isalpha(name[0]) ) + ok = 0; + for(i=1; ok && name[i]!='\0'; ++i) { + int c = name[i]; + if( !(isalnum(c) || c=='_') ) + ok = 0; + } + if(!ok) { fprintf(stderr,"%s:%d: \"%s\" is not a legal parameter name.\n", __FILE__,__LINE__, name); - fprintf(stderr,"input: %s\n", orig); + fprintf(stderr," input: %s\n", orig); exit(EXIT_FAILURE); } char *formula; double value, sd = 0.0; if(pstat == Constrained) { - formula = next; + formula = stripWhiteSpace(next); assert(formula != NULL); }else{ // Read parameter value @@ -217,7 +225,7 @@ void parseParam(char *next, enum ParamType type, exit(EXIT_FAILURE); } if(sd <= 0.0) { - fprintf(stderr,"%s:%s:%d: Error sd =%lg <= 0.0\n", + fprintf(stderr,"%s:%s:%d: Error sd=%lg <= 0.0\n", __FILE__,__func__,__LINE__, sd); fprintf(stderr,"input: %s\n", orig); exit(EXIT_FAILURE); @@ -678,19 +686,19 @@ const char *tstInput = "time free Tc=1\n" "time free Tab=3\n" "time free Tabc=5.5\n" - "twoN free 2Na=100\n" - "twoN fixed 2Nb=123\n" - "twoN free 2Nc=213.4\n" - "twoN fixed 2Nbb=32.1\n" - "twoN constrained 2Nab=100 + -1.2*Tab\n" - "twoN fixed 2Nabc=1.2e2\n" + "twoN free twoNa=100\n" + "twoN fixed twoNb=123\n" + "twoN free twoNc=213.4\n" + "twoN fixed twoNbb=32.1\n" + "twoN constrained twoNab=100 + -1.2*Tab\n" + "twoN fixed twoNabc=1.2e2\n" "mixFrac free Mc=0.02\n" - "segment a t=T0 twoN=2Na samples=1\n" - "segment b t=T0 twoN=2Nb samples=1\n" - "segment c t=Tc twoN=2Nc samples=1\n" - "segment bb t=Tc twoN=2Nbb\n" - "segment ab t=Tab twoN=2Nab\n" - "segment abc t=Tabc twoN=2Nabc\n" + "segment a t=T0 twoN=twoNa samples=1\n" + "segment b t=T0 twoN=twoNb samples=1\n" + "segment c t=Tc twoN=twoNc samples=1\n" + "segment bb t=Tc twoN=twoNbb\n" + "segment ab t=Tab twoN=twoNab\n" + "segment abc t=Tabc twoN=twoNabc\n" "mix b from bb + Mc * c\n" "derive a from ab\n" "derive bb from ab\n" diff --git a/src/parstore.c b/src/parstore.c index e3a14d91..6b96eb6b 100644 --- a/src/parstore.c +++ b/src/parstore.c @@ -25,30 +25,26 @@ #include "parstore.h" #include "parkeyval.h" #include "dtnorm.h" +#include "tinyexpr.h" #include #include #include #include #include -typedef struct Term Term; - -/// One term in a polynomial of form -/// b*x[0]*x[1]*...*x[n-1] -struct Term { - double b; - int n; - char **lbl; - double **x; - Term *next; -}; - -/// Constraint specifying one variable as a linear function of several -/// others. -struct Constraint { - double a; - Term *term; -}; +// Set value of i'th constraint. Abort with an error message if result is +// NaN. +#define SET_CONSTR(i) do { \ + self->constrainedVal[(i)] = te_eval(self->constr[(i)]); \ + if(isnan(self->constrainedVal[(i)])) { \ + fprintf(stderr,"%s:%d: constraint returned NaN\n", \ + __FILE__,__LINE__); \ + fprintf(stderr,"formula: %s = %s\n", \ + self->nameConstrained[(i)], self->formulas[(i)]); \ + ParStore_printFree(self, stderr); \ + exit(EXIT_FAILURE); \ + } \ + }while(0) struct ParStore { int nFixed, nFree, nGaussian, nConstrained; // num pars @@ -65,18 +61,12 @@ struct ParStore { double constrainedVal[MAXPAR]; // parameter values double mean[MAXPAR]; // Gaussian means double sd[MAXPAR]; // Gaussian standard deviations - Constraint *constr[MAXPAR]; // controls constrainedVal entries + te_expr *constr[MAXPAR]; // controls constrainedVal entries + te_variable te_pars[MAXPAR]; + char *formulas[MAXPAR]; // formulas of constrained vars }; -static int compareDblPtrs(const void *void_x, const void *void_y); -static int compareDbls(const void *void_x, const void *void_y); static inline int chrcount(const char *s, char c); -Term *Term_new(Term *head, ParKeyVal *pkv, char *str); -Term *Term_dup(Term *old, ParKeyVal *pkv); -void Term_free(Term *self); -double Term_value(Term *self); -void Term_prFormula(Term *self, FILE *fp); -int Term_equals(Term *lhs, Term *rhs); /// Count the number of copies of character c in string s static inline int chrcount(const char *s, char c) { @@ -91,14 +81,14 @@ static inline int chrcount(const char *s, char c) { } /// Return <0, 0, or >0, as x is <, ==, or > y. -static int compareDblPtrs(const void *void_x, const void *void_y) { +int compareDblPtrs(const void *void_x, const void *void_y) { double * const * x = (double * const *) void_x; double * const * y = (double * const *) void_y; return **x - **y; } /// Return <0, 0, or >0, as x is <, ==, or > y. -static int compareDbls(const void *void_x, const void *void_y) { +int compareDbls(const void *void_x, const void *void_y) { const double * x = (const double *) void_x; const double * y = (const double *) void_y; return *x - *y; @@ -143,14 +133,14 @@ void ParStore_printConstrained(ParStore *self, FILE *fp) { int i; fprintf(fp, "%5d constrained:\n", self->nConstrained); for(i=0; i < self->nConstrained; ++i) { - fprintf(fp, " %8s = %lg = ", self->nameConstrained[i], - self->constrainedVal[i]); - Constraint_prFormula(self->constr[i], fp); + fprintf(fp, " %8s = %lg = %s\n", self->nameConstrained[i], + self->constrainedVal[i], + self->formulas[i]); } } /// Constructor -ParStore *ParStore_new(void) { +ParStore *ParStore_new(void) { ParStore *self = malloc(sizeof(ParStore)); CHECKMEM(self); memset(self, 0, sizeof(ParStore)); @@ -159,8 +149,9 @@ ParStore *ParStore_new(void) { } /// Duplicate a ParStore -ParStore *ParStore_dup(const ParStore * old) { +ParStore *ParStore_dup(const ParStore * old) { assert(old); + int status; ParStore *new = memdup(old, sizeof(ParStore)); new->pkv = NULL; @@ -169,6 +160,8 @@ ParStore *ParStore_dup(const ParStore * old) { new->nameFree[i] = strdup(old->nameFree[i]); new->pkv = ParKeyVal_add(new->pkv, new->nameFree[i], new->freeVal + i, Free); + new->te_pars[i].name = new->nameFree[i]; + new->te_pars[i].address = new->freeVal + i; } for(i = 0; i < new->nFixed; ++i) { @@ -187,7 +180,15 @@ ParStore *ParStore_dup(const ParStore * old) { new->nameConstrained[i] = strdup(old->nameConstrained[i]); new->pkv = ParKeyVal_add(new->pkv, new->nameConstrained[i], new->constrainedVal + i, Constrained); - new->constr[i] = Constraint_dup(old->constr[i], new->pkv); + new->formulas[i] = strdup(old->formulas[i]); + new->constr[i] = te_compile(new->formulas[i], new->te_pars, + new->nFree, &status); + if(new->constr[i] == NULL) { + fprintf(stderr,"%s:%d: parse error\n", __FILE__,__LINE__); + fprintf(stderr," %s\n", new->formulas[i]); + fprintf(stderr," %*s^\nError near here\n", status-1, ""); + exit(EXIT_FAILURE); + } } ParStore_sanityCheck(new, __FILE__, __LINE__); return new; @@ -208,7 +209,8 @@ void ParStore_free(ParStore * self) { for(i=0; i < self->nConstrained; ++i) { free(self->nameConstrained[i]); - Constraint_free(self->constr[i]); + free(self->formulas[i]); + te_free(self->constr[i]); } ParKeyVal_free(self->pkv); @@ -245,6 +247,8 @@ void ParStore_addFreePar(ParStore * self, double value, self->hiFree[i] = hi; self->nameFree[i] = strdup(name); CHECKMEM(self->nameFree[i]); + self->te_pars[i].name = self->nameFree[i]; + self->te_pars[i].address = self->freeVal + i; // Linked list associates pointer with parameter name. self->pkv = ParKeyVal_add(self->pkv, name, self->freeVal + i, @@ -309,9 +313,9 @@ void ParStore_addFixedPar(ParStore * self, double value, const char *name) { } /// Add constrained parameter to ParStore. -void ParStore_addConstrainedPar(ParStore * self, char *str, +void ParStore_addConstrainedPar(ParStore * self, const char *str, const char *name) { - int i = self->nConstrained; + int status, i = self->nConstrained; ParamStatus pstat; if(NULL != ParKeyVal_get(self->pkv, &pstat, name)) { fprintf(stderr,"%s:%d: Duplicate definition of parameter \"%s\".\n", @@ -327,13 +331,20 @@ void ParStore_addConstrainedPar(ParStore * self, char *str, exit(EXIT_FAILURE); } + self->formulas[i] = strdup(str); self->nameConstrained[i] = strdup(name); CHECKMEM(self->nameConstrained[i]); self->pkv = ParKeyVal_add(self->pkv, name, self->constrainedVal + i, Constrained); - self->constr[i] = Constraint_new(self->pkv, str); - self->constrainedVal[i] = Constraint_getValue(self->constr[i]); + self->constr[i] = te_compile(str, self->te_pars, self->nFree, &status); + if(self->constr[i] == NULL) { + fprintf(stderr,"%s:%d: parse error\n", __FILE__,__LINE__); + fprintf(stderr," %s\n", str); + fprintf(stderr," %*s^\nError near here\n", status-1, ""); + exit(EXIT_FAILURE); + } + SET_CONSTR(i); } /// Return the number of fixed parameters @@ -372,6 +383,13 @@ double ParStore_getFree(ParStore * self, int i) { return self->freeVal[i]; } +/// Get value of i'th constrained parameter +double ParStore_getConstrained(ParStore * self, int i) { + assert(i < self->nConstrained); + SET_CONSTR(i); + return self->constrainedVal[i]; +} + /// Get value of i'th Gaussian parameter double ParStore_getGaussian(ParStore * self, int i) { assert(i < self->nGaussian); @@ -487,51 +505,70 @@ void ParStore_sanityCheck(ParStore *self, const char *file, int line) { } /// Return 1 if two ParStore objects are equal; 0 otherwise. -int ParStore_equals(const ParStore *lhs, const ParStore *rhs) { +int ParStore_equals(ParStore *lhs, ParStore *rhs) { if(lhs == rhs) return 1; - if(lhs->nFixed != rhs->nFixed) + if(lhs->nFixed != rhs->nFixed) { return 0; - if(lhs->nFree != rhs->nFree) + } + if(lhs->nFree != rhs->nFree) { return 0; - if(lhs->nGaussian != rhs->nGaussian) + } + if(lhs->nGaussian != rhs->nGaussian) { return 0; - if(lhs->nConstrained != rhs->nConstrained) + } + if(lhs->nConstrained != rhs->nConstrained) { return 0; + } if(0 != memcmp(lhs->fixedVal, rhs->fixedVal, - lhs->nFixed*sizeof(lhs->fixedVal[0]))) + lhs->nFixed*sizeof(lhs->fixedVal[0]))) { return 0; + } if(0 != memcmp(lhs->freeVal, rhs->freeVal, - lhs->nFree*sizeof(lhs->freeVal[0]))) + lhs->nFree*sizeof(lhs->freeVal[0]))) { return 0; + } + if(ParStore_constrain(lhs)) { + fprintf(stderr,"%s:%d: free parameters violate constraints\n", + __FILE__,__LINE__); + } + if(ParStore_constrain(rhs)) { + fprintf(stderr,"%s:%d: free parameters violate constraints\n", + __FILE__,__LINE__); + } if(0 != memcmp(lhs->constrainedVal, rhs->constrainedVal, - lhs->nConstrained*sizeof(lhs->constrainedVal[0]))) + lhs->nConstrained*sizeof(lhs->constrainedVal[0]))) { return 0; + } if(0 != memcmp(lhs->loFree, rhs->loFree, - lhs->nFree*sizeof(lhs->loFree[0]))) + lhs->nFree*sizeof(lhs->loFree[0]))) { return 0; + } if(0 != memcmp(lhs->hiFree, rhs->hiFree, - lhs->nFree*sizeof(lhs->hiFree[0]))) + lhs->nFree*sizeof(lhs->hiFree[0]))) { return 0; + } if(0 != memcmp(lhs->mean, rhs->mean, - lhs->nGaussian*sizeof(lhs->mean[0]))) + lhs->nGaussian*sizeof(lhs->mean[0]))) { return 0; + } if(0 != memcmp(lhs->sd, rhs->sd, - lhs->nGaussian*sizeof(lhs->sd[0]))) + lhs->nGaussian*sizeof(lhs->sd[0]))) { return 0; + } int i; for(i=0; i < lhs->nFixed; ++i) - if(0 != strcmp(lhs->nameFixed[i], rhs->nameFixed[i])) + if(0 != strcmp(lhs->nameFixed[i], rhs->nameFixed[i])){ return 0; + } for(i=0; i < lhs->nFree; ++i) - if(0 != strcmp(lhs->nameFree[i], rhs->nameFree[i])) + if(0 != strcmp(lhs->nameFree[i], rhs->nameFree[i])) { return 0; + } for(i=0; i < lhs->nGaussian; ++i) - if(0 != strcmp(lhs->nameGaussian[i], rhs->nameGaussian[i])) - return 0; - for(i=0; i < lhs->nConstrained; ++i) - if(!Constraint_equals(lhs->constr[i], rhs->constr[i])) + if(0 != strcmp(lhs->nameGaussian[i], rhs->nameGaussian[i])) { return 0; + } return ParKeyVal_equals(lhs->pkv, rhs->pkv); } @@ -573,15 +610,24 @@ void ParStore_constrain_ptr(ParStore *self, double *ptr) { assert(i < self->nConstrained); // set value of constrained parameter - self->constrainedVal[i] = Constraint_getValue(self->constr[i]); + SET_CONSTR(i); assert(*ptr == self->constrainedVal[i]); } -/// Set values of all constrained parameters -void ParStore_constrain(ParStore *self) { +/// First check to see that free parameters obey boundary constraints. +/// If not, then return 1. Otherwise, set values of all constrained +/// parameters and return 0. +int ParStore_constrain(ParStore *self) { int i; - for(i=0; i < self->nConstrained; ++i) - self->constrainedVal[i] = Constraint_getValue(self->constr[i]); + for(i=0; i < self->nFree; ++i) { + if(self->freeVal[i] < self->loFree[i] + || self->freeVal[i] > self->hiFree[i]) + return 1; + } + for(i=0; i < self->nConstrained; ++i) { + SET_CONSTR(i); + } + return 0; } /// Make sure Bounds object is sane. @@ -604,308 +650,3 @@ int Bounds_equals(const Bounds *lhs, const Bounds *rhs) { && lhs->lo_t == rhs->lo_t && lhs->hi_t == rhs->hi_t; } - -Constraint *Constraint_new(ParKeyVal *pkv, char *str) { - Constraint *self = malloc(sizeof(Constraint)); - CHECKMEM(self); - - char *token, *next = str; - - // Parse y intercept - token = strsep(&next, "+"); - if(token==NULL) { - fprintf(stderr,"%s:%d: Bad y intercept in constraint:" - " \"%s\"\n", __FILE__,__LINE__,next); - exit(EXIT_FAILURE); - } - if(strchr(token, '*')) { - fprintf(stderr,"%s:%s:%d:" - " 1st term of formula (%s) is illegal.\n" - " Should be an additive constant.\n", - __FILE__, __func__, __LINE__, token); - exit(EXIT_FAILURE); - } - errno=0; - self->a = parseDbl(token); - if(errno) { - char err_buff[50]; - strerror_r(errno, err_buff, sizeof(err_buff)); - fprintf(stderr,"%s:%d: Bad float \"%s\" (%s).\n", - __FILE__,__LINE__, token, err_buff); - exit(EXIT_FAILURE); - } - - if(next==NULL) { - fprintf(stderr,"%s:%s:%d: Formula must have at least 2 terms.\n" - " Got \"%s\".\n", - __FILE__,__func__,__LINE__, token); - exit(EXIT_FAILURE); - } - - // Parse terms of regression equation - self->term = NULL; - while(next) { - token = strsep(&next, "+"); - if(token==NULL) { - fprintf(stderr,"%s:%d: Bad regression term:" - " \"%s\"\n", __FILE__,__LINE__,next); - exit(EXIT_FAILURE); - } - self->term = Term_new(self->term, pkv, token); - } - - return self; -} - -Constraint *Constraint_free(Constraint *self) { - assert(self != NULL); - Term_free(self->term); - free(self); - return NULL; -} - -/// Return the current value of the constrained variable -double Constraint_getValue(Constraint *self) { - double y = self->a; - y += Term_value(self->term); - return y; -} - -void Constraint_prFormula(Constraint *self, FILE *fp) { - assert(self != NULL); - fprintf(fp, "%lg", self->a); - Term_prFormula(self->term, fp); - putc('\n', fp); -} - -Constraint *Constraint_dup(Constraint *old, ParKeyVal *pkv) { - Constraint *new = malloc(sizeof(Constraint)); - CHECKMEM(new); - new->a = old->a; - new->term = Term_dup(old->term, pkv); - return new; -} - -int Constraint_equals(Constraint *lhs, Constraint *rhs) { - if(lhs->a != rhs->a) { - fprintf(stderr,"%s:%s:%d\n", __FILE__,__func__,__LINE__); - return 0; - } - return Term_equals(lhs->term, rhs->term); -} - -/// Allocate a new term and initialize is using -/// str and pkv. -/// str should look like 1.23*name_1*name_2*...*name_n -Term *Term_new(Term *head, ParKeyVal *pkv, char *str) { - assert(str != NULL); - assert(strlen(str) > 0); - - Term *self = malloc(sizeof(Term)); - CHECKMEM(self); - - char *s, *token, *next = str; - - // parse beta, the coefficient of current term - token = strsep(&next, "*"); - if(token==NULL) { - fprintf(stderr,"%s:%d: Bad term \"%s\"\n", __FILE__,__LINE__,str); - exit(EXIT_FAILURE); - } - errno = 0; - self->b = parseDbl(token); - if(errno) { - char err_buff[50]; - strerror_r(errno, err_buff, sizeof(err_buff)); - fprintf(stderr,"%s:%d: Bad float \"%s\" (%s).\n", - __FILE__,__LINE__, token, err_buff); - exit(EXIT_FAILURE); - } - - // get dimension and allocate arrays - if(next == NULL) { - fprintf(stderr,"%s:%d: Term (%s) has no variable.\n", - __FILE__,__LINE__,token); - exit(EXIT_FAILURE); - } - self->n = 1 + chrcount(next, '*'); - self->lbl = malloc(self->n * sizeof(self->lbl[0])); - CHECKMEM(self->lbl); - self->x = malloc(self->n * sizeof(self->x[0])); - CHECKMEM(self->x); - - // parse variable names - ParamStatus pstat; - int i; - for(i=0; i < self->n; ++i) { - token = strsep(&next, "*"); - assert(token != NULL); - s = stripWhiteSpace(token); - double *ptr = ParKeyVal_get(pkv, &pstat, s); - if(ptr==NULL || pstat != Free) { - fprintf(stderr,"%s:%d: there is no free parameter \"%s\".\n", - __FILE__, __LINE__, s); - exit(EXIT_FAILURE); - } - self->lbl[i] = strdup(s); - self->x[i] = ptr; - } - - self->next = head; - return self; -} - -Term *Term_dup(Term *old, ParKeyVal *pkv) { - if(old == NULL) - return NULL; - Term *new = malloc(sizeof(Term)); - CHECKMEM(new); - memcpy(new, old, sizeof(Term)); - new->lbl = malloc(new->n * sizeof(new->lbl[0])); - CHECKMEM(new->lbl); - new->x = malloc(new->n * sizeof(new->x[0])); - CHECKMEM(new->x); - int i; - ParamStatus pstat; - for(i=0; i < new->n; ++i) { - new->lbl[i] = strdup(old->lbl[i]); - new->x[i] = ParKeyVal_get(pkv, &pstat, new->lbl[i]); - if(new->x[i] == NULL || pstat!=Free) { - fprintf(stderr,"%s:%d: no free parameter named \"%s\"\n", - __FILE__,__LINE__,new->lbl[i]); - exit(EXIT_FAILURE); - } - } - new->next = Term_dup(old->next, pkv); - return new; -} - -void Term_free(Term *self) { - if(self == NULL) - return; - Term_free(self->next); - int i; - for(i=0; i < self->n; ++i) - free(self->lbl[i]); - free(self->lbl); - free(self->x); - free(self); -} - -/// Calculate sum of polynomial terms. -/// Polynomial is stored as a linked list of terms. -double Term_value(Term *self) { - if(self==NULL) - return 0.0; - int i; - double v = self->b; - for(i=0; i < self->n; ++i) - v *= *self->x[i]; - return v + Term_value(self->next); -} - -void Term_prFormula(Term *self, FILE *fp) { - if(self==NULL) - return; - Term_prFormula(self->next, fp); - fprintf(fp, " + %lg", self->b); - int i; - for(i=0; i < self->n; ++i) - fprintf(fp, "*%s", self->lbl[i]); -} - -int Term_equals(Term *lhs, Term *rhs) { - if(lhs==NULL && rhs==NULL) - return 1; - if(lhs==NULL || rhs==NULL) { - return 0; - } - if(lhs->b != rhs->b) { - return 0; - } - if(lhs->n != rhs->n) { - return 0; - } - int i; - for(i=0; i < lhs->n; ++i) { - if(0 != strcmp(lhs->lbl[i], rhs->lbl[i])) { - return 0; - } - } - return Term_equals(lhs->next, rhs->next); -} - -#ifdef TEST - -#ifdef NDEBUG -#error "Unit tests must be compiled without -DNDEBUG flag" -#endif - -int main(int argc, char **argv) { - - int verbose=0; - - if(argc > 1) { - if(argc!=2 || 0!=strcmp(argv[1], "-v")) { - fprintf(stderr,"usage: xterm [-v]\n"); - exit(EXIT_FAILURE); - } - verbose = 1; - } - - double x=1.0, y=1.0, z=2.0; - double *px=&x, *py=&y, *pz=&z; - double **ppx=&px, **ppy=&py, **ppz=&pz; - - assert(0 == compareDbls(px, py)); - assert(0 == compareDbls(px, px)); - assert(0 > compareDbls(px, pz)); - assert(0 < compareDbls(pz, py)); - - unitTstResult("compareDbls", "OK"); - - assert(0 == compareDblPtrs(ppx, ppy)); - assert(0 == compareDblPtrs(ppx, ppx)); - assert(0 > compareDblPtrs(ppx, ppz)); - assert(0 < compareDblPtrs(ppz, ppy)); - - unitTstResult("compareDblPtrs", "OK"); - - ParKeyVal *pkv = NULL; - pkv = ParKeyVal_add(pkv, "x", &x, Free); - pkv = ParKeyVal_add(pkv, "y", &y, Free); - pkv = ParKeyVal_add(pkv, "z", &z, Free); - - Term *term = NULL; - char buff[50]; - strcpy(buff, "3*x"); - term = Term_new(term, pkv, buff); - assert(3*x == Term_value(term)); - strcpy(buff, " 1 * x *x"); - term = Term_new(term, pkv, buff); - assert(3*x + x*x == Term_value(term)); - strcpy(buff, " 2* z*z*x"); - term = Term_new(term, pkv, buff); - assert(3*x + x*x + 2*z*z*x == Term_value(term)); - if(verbose) { - printf("Value=%lf\n", Term_value(term)); - printf("Formula:"); - Term_prFormula(term, stdout); - putchar('\n'); - } - - double xx=1.0, yy=1.0, zz=2.0; - ParKeyVal *pkv2 = NULL; - pkv2 = ParKeyVal_add(pkv2, "x", &xx, Free); - pkv2 = ParKeyVal_add(pkv2, "y", &yy, Free); - pkv2 = ParKeyVal_add(pkv2, "z", &zz, Free); - - Term *term2 = Term_dup(term, pkv2); - assert(Term_equals(term, term2)); - Term_free(term2); - Term_free(term); - ParKeyVal_free(pkv); - ParKeyVal_free(pkv2); - unitTstResult("Term", "OK"); -} -#endif diff --git a/src/parstore.h b/src/parstore.h index 5ac380fc..0720091f 100644 --- a/src/parstore.h +++ b/src/parstore.h @@ -19,9 +19,9 @@ void ParStore_addGaussianPar(ParStore * self, double mean, double sd, const char *name); void ParStore_addFixedPar(ParStore * self, double value, const char *name); -void ParStore_addConstrainedPar(ParStore * self, char *str, +void ParStore_addConstrainedPar(ParStore * self, const char *str, const char *name); -void ParStore_constrain(ParStore *self); +int ParStore_constrain(ParStore *self); void ParStore_constrain_ptr(ParStore *self, double *ptr); int ParStore_nFixed(ParStore * self); int ParStore_nFree(ParStore * self); @@ -31,6 +31,7 @@ int ParStore_nConstrained(ParStore * self); double ParStore_getFixed(ParStore * self, int i); double ParStore_getFree(ParStore * self, int i); double ParStore_getGaussian(ParStore * self, int i); +double ParStore_getConstrained(ParStore * self, int i); const char *ParStore_getNameFixed(ParStore * self, int i); const char *ParStore_getNameFree(ParStore * self, int i); @@ -48,7 +49,7 @@ void ParStore_sanityCheck(ParStore * self, const char *file, int line); void ParStore_print(ParStore * self, FILE * fp); void ParStore_printFree(ParStore * self, FILE * fp); void ParStore_printConstrained(ParStore * self, FILE * fp); -int ParStore_equals(const ParStore * lhs, const ParStore * rhs); +int ParStore_equals(ParStore * lhs, ParStore * rhs); void ParStore_setFreeParams(ParStore * self, int n, double x[n]); void ParStore_getFreeParams(ParStore * self, int n, double x[n]); void ParStore_sample(ParStore * self, double *ptr, double low, @@ -57,11 +58,8 @@ void ParStore_sample(ParStore * self, double *ptr, double low, void Bounds_sanityCheck(Bounds * self, const char *file, int line); int Bounds_equals(const Bounds * lhs, const Bounds * rhs); -Constraint *Constraint_new(ParKeyVal * pkv, char *str); -Constraint *Constraint_free(Constraint * self); -double Constraint_getValue(Constraint * self); -void Constraint_prFormula(Constraint * self, FILE * fp); -Constraint *Constraint_dup(Constraint * old, ParKeyVal * pkv); -int Constraint_equals(Constraint * lhs, Constraint * rhs); +int compareDblPtrs(const void *void_x, const void *void_y); +int compareDbls(const void *void_x, const void *void_y); + #endif diff --git a/src/popnode.c b/src/popnode.c index 8ce1b0b5..00af296b 100644 --- a/src/popnode.c +++ b/src/popnode.c @@ -366,6 +366,14 @@ Gene *PopNode_coalesce(PopNode * self, gsl_rng * rng) { double x; double end = (NULL == self->end ? HUGE_VAL : *self->end); + // Make sure interval is sane. + if(isnan(end)) { + fprintf(stderr,"%s:%d: end of interval is NaN.\n", + __FILE__,__LINE__); + PopNode_printShallow(self, stderr); + exit(1); + } + if(self->child[0]) (void) PopNode_coalesce(self->child[0], rng); if(self->child[1]) @@ -569,7 +577,10 @@ static void PopNode_randomize_r(PopNode * self, Bounds bnd, void PopNode_gaussian(PopNode * self, Bounds bnd, ParStore * ps, gsl_rng * rng) { PopNode_untouch(self); - ParStore_constrain(ps); + if(ParStore_constrain(ps)) { + fprintf(stderr,"%s:%d: free parameters violate constraints\n", + __FILE__,__LINE__); + } PopNode_gaussian_r(self, bnd, ps, rng); } diff --git a/src/raf.c b/src/raf.c new file mode 100644 index 00000000..22e7543a --- /dev/null +++ b/src/raf.c @@ -0,0 +1,279 @@ +/** +@file raf.c +@page raf +@brief Calculate reference allele frequency, raf. + +Input file should consist of tab-separated columns: + +1. chromosome +2. position +3. reference allele +4. alternate alleles +5. genotype in format "0/1" or "0|1", where 0 represents +a copy of the reference allele and 1 a copy of the derived +allele. +6. etc for as many columns as there are genotypes. + +This can be generated from a vcf file as follows: + + bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%GT]\n' fname.vcf.gz + +Output is in 5 columns, separated by tabs: + +1. chromosome +2. position of the nucleotide +3. ref, the reference allele +4. alt, the alternate allele +5. raf, reference allele frequency + +The input files should include all sites at which derived alleles are +present in any of the populations under study. For example, consider +an analysis involving modern humans and Neanderthals. The modern human +data must include all sites at which Neanderthals carry derived +alleles, even if these sites do not vary among modern humans. To +accomplish this, it is best to use whole-genome data for all +populations. + +The input should not contain duplicate nucleotide sites, the +chromosomes should be sorted in lexical order, and within each +chromosome, the nucleotides should be in numerical order. Otherwise, +raf will abort with an error. + +Sites are rejected unless they have a single ref. Missing values are +allowed for the alt allele. At the end of the job a summary of +rejected sites is written to stderr. + +@copyright Copyright (c) 2017, Alan R. Rogers +. This file is released under the Internet +Systems Consortium License, which can be found in file "LICENSE". +*/ + +#include "misc.h" +#include +#include +#include +#include + +int main(int argc, char **argv) { + + if(argc != 1) { + fprintf(stderr, "Usage: raf\n"); + fprintf(stderr, " Reads standard input;" + " writes to standard output.\n"); + exit(EXIT_FAILURE); + } + + const int buffsize = 16384; + char buff[buffsize]; + + // Keep track of the number of sites at which the number + // of reference or alternate alleles differs from 0 + long int zeroref = 0, zeroalt = 0, zerogtype = 0; + long int missref = 0; + long int multref = 0, multalt = 0; + long int nbad = 0, ngood = 0; + int ok; // is current line acceptable + long unsigned lastnucpos = 0, nucpos; + char lastchr[100] = { '\0' }; + + printf("#%s\t%s\t%s\t%s\t%s\n", "chr", "pos", "ref", "alt", "raf"); + while(1) { + if(NULL == fgets(buff, buffsize, stdin)) { + break; + } + if(NULL == strchr(buff, '\n') && !feof(stdin)) { + fprintf(stderr, "%s:%d: Buffer overflow. size=%zu\n", + __FILE__, __LINE__, sizeof(buff)); + exit(EXIT_FAILURE); + } + char *chr, *pos, *reftoken, *alttoken; + char *gtype, *next = buff; + + chr = strsep(&next, "\t"); // field 0 + pos = strsep(&next, "\t"); // field 1 + reftoken = strsep(&next, "\t"); // field 2 + alttoken = strsep(&next, "\t"); // field 3 + + nucpos = strtoul(pos, NULL, 10); + + // lowercase alleles + strlowercase(reftoken); + strlowercase(alttoken); + + // strip space characters + (void) stripchr(chr, ' '); + (void) stripchr(pos, ' '); + (void) stripchr(reftoken, ' '); + (void) stripchr(alttoken, ' '); + + // Check sort of chromosomes + if(*lastchr) { + int diff = strcmp(lastchr, chr); + if(diff > 0) { + // bad sort + fprintf(stderr, "%s:%d: unsorted chromosomes\n", + __FILE__, __LINE__); + fprintf(stderr, " %s > %s\n", lastchr, chr); + exit(1); + } else if(diff < 0) { + // new chromosome + int status = + snprintf(lastchr, sizeof lastchr, "%s", chr); + if(status >= sizeof lastchr) { + fprintf(stderr, "%s:%d: buffer overflow\n", + __FILE__, __LINE__); + exit(1); + } + lastnucpos = 0; + } + } else { + // initialize lastchr + int status = snprintf(lastchr, sizeof lastchr, "%s", chr); + if(status >= sizeof lastchr) { + fprintf(stderr, "%s:%d: buffer overflow\n", + __FILE__, __LINE__); + exit(1); + } + assert(lastnucpos == 0); + } + + // Check sort of nucleotide positions + if(lastnucpos) { + if(lastnucpos == nucpos) { + fprintf(stderr, "%s:%d: Duplicate: chr=%s pos=%lu\n", + __FILE__, __LINE__, chr, nucpos); + fprintf(stderr, "%s:%d: Previous : chr=%s pos=%lu\n", + __FILE__, __LINE__, lastchr, lastnucpos); + exit(1); + } else if(lastnucpos > nucpos) { + fprintf(stderr, "%s:%d: Missorted nucleotide positions\n", + __FILE__, __LINE__); + fprintf(stderr, " Current : chr=%s pos=%lu\n", chr, nucpos); + fprintf(stderr, " Previous: chr=%s pos=%lu\n", + lastchr, lastnucpos); + exit(1); + } + } + lastnucpos = nucpos; + + int maxalleles = 10, nref, nalt; + char *ref[maxalleles]; + char *alt[maxalleles]; + + // Replace '-' with '.' in allele tokens, so there is only + // one kind of missing value. + strReplaceChr(reftoken, '-', '.'); + strReplaceChr(alttoken, '-', '.'); + + // REF and ALT alleles are separated by commas. + nref = tokenize(maxalleles, ref, reftoken, ","); + nalt = tokenize(maxalleles, alt, alttoken, ","); + + // Skip sites at which the number of reference or alternate + // alleles differs from 1. + ok = 1; + if(nref == 0) { + ++zeroref; + ok = 0; + } + if(nalt == 0) { + ++zeroalt; + ok = 0; + } + if(nref > 1) { + ++multref; + ok = 0; + } + if(nalt > 1) { + ++multalt; + ok = 0; + } + if(!ok) { + ++nbad; + continue; + } + // Skip if ref is missing. + if(0==strcmp(ref[0], ".")) { + ++missref; + ok = 0; + } + + if(!ok) { + ++nbad; + continue; + } + + int x = 0, n = 0; + gtype = strsep(&next, "\t"); // field 4 + + while(gtype != NULL) { + //# gtype is a string like "0|1" or "0/1". + switch (gtype[0]) { + case '.': + break; + case '0': + ++x; + ++n; + break; + case '1': + ++n; + break; + default: + fprintf(stderr, "%s:%d: Bad genotype: %s\n", + __FILE__, __LINE__, gtype); + fprintf(stderr, + " chr=%s pos=%s ref=%s alt=%s gtype=%s\n", + chr, pos, ref[0], alt[0], gtype); + exit(EXIT_FAILURE); + } + + switch (gtype[2]) { + case '.': + break; + case '0': + ++x; + ++n; + break; + case '1': + ++n; + break; + default: + fprintf(stderr, "%s:%d: Bad genotype: %s\n", + __FILE__, __LINE__, gtype); + fprintf(stderr, + " chr=%s pos=%s ref=%s alt=%s gtype=%s\n", + chr, pos, ref[0], alt[0], gtype); + exit(EXIT_FAILURE); + } + gtype = strsep(&next, "\t"); // additional fields + } + + if(n == 0) { + ++zerogtype; + ++nbad; + continue; + } else + ++ngood; + + double p = x / ((double) n); + printf("%s\t%s\t%s\t%s\t%0.18g\n", + chr, pos, ref[0], alt[0], p); + } + fprintf(stderr, "raf: %ld good sites; %ld rejected\n", ngood, nbad); + if(zeroref) + fprintf(stderr, "raf: bad sites with 0 ref alleles: %ld\n", zeroref); + if(zeroalt) + fprintf(stderr, "raf: bad sites with 0 alt alleles: %ld\n", zeroalt); + if(zerogtype) + fprintf(stderr, "raf: bad sites with 0 genotypes: %ld\n", zerogtype); + if(multref) + fprintf(stderr, "raf: bad sites with multiple ref alleles: %ld\n", + multref); + if(multalt) + fprintf(stderr, "raf: bad sites with multiple alt alleles: %ld\n", + multalt); + if(missref) + fprintf(stderr, "raf: bad sites with missing ref alleles: %ld\n", + missref); + return 0; +} diff --git a/src/rafreader.c b/src/rafreader.c new file mode 100644 index 00000000..5757a851 --- /dev/null +++ b/src/rafreader.c @@ -0,0 +1,373 @@ +/** + @file rafreader.c + @brief Class RAFReader: read a raf file. + + @copyright Copyright (c) 2016, Alan R. Rogers + . This file is released under the Internet + Systems Consortium License, which can be found in file "LICENSE". +*/ +#include "rafreader.h" +#include "tokenizer.h" +#include "misc.h" +#include "error.h" +#include +#include +#include +#include +#include + +#define MAXFIELDS 5 + +int iscomment(const char *s); + +/// RAFReader constructor +RAFReader *RAFReader_new(const char *fname) { + RAFReader *self = malloc(sizeof(*self)); + CHECKMEM(self); + memset(self, 0, sizeof(RAFReader)); + self->fname = strdup(fname); + CHECKMEM(self->fname); + self->fp = fopen(self->fname, "r"); + if(self->fp == NULL) { + fprintf(stderr, "%s:%s:%d: can't open \"%s\" for input.\n", + __FILE__, __func__, __LINE__, self->fname); + exit(EXIT_FAILURE); + } + self->tkz = Tokenizer_new(MAXFIELDS); + self->snpid = -1; + self->raf = strtod("NaN", NULL); + self->daf = strtod("NaN", NULL); + return self; +} + +/// Clear all chromosome names +void RAFReader_clearChromosomes(int n, RAFReader * r[n]) { + int i; + for(i = 0; i < n; ++i) + r[i]->chr[0] = '\0'; +} + +/// RAFReader destructor +void RAFReader_free(RAFReader * self) { + fclose(self->fp); + free(self->fname); + Tokenizer_free(self->tkz); + free(self); +} + +/// Return 1 if first non-white character in string is '#'; 0 +/// otherwise. +int iscomment(const char *s) { + int rval; + while(*s != '\0' && isspace(*s)) + ++s; + rval = (*s == '#'); + return rval; +} + +/// Read the next site and set derived allele frequency (daf) within +/// each reader. +/// @return 0 on success, or else EOF, NO_ANCESTRAL_ALLELE, +/// BUFFER_OVERFLOW, BAD_RAF_INPUT, BAD_SORT, or an errno code for +/// failure to parse a floating-point number. +int RAFReader_next(RAFReader * self) { + int ntokens; + int status; + char buff[100]; + long unsigned prevnucpos = 0UL; + + // Find a line of input + while(1) { + if(fgets(buff, sizeof(buff), self->fp) == NULL) + return EOF; + if(NULL == strchr(buff, '\n') && !feof(self->fp)) { +#ifdef NDEBUG + fprintf(stderr, "%s:%d: Buffer overflow. size=%zu\n", + __FILE__, __LINE__, sizeof(buff)); +#endif + return BUFFER_OVERFLOW; + } + if(iscomment(buff)) + continue; + Tokenizer_split(self->tkz, buff, "\t"); + ntokens = Tokenizer_strip(self->tkz, " \n"); + if(ntokens > 0) + break; + } + + if(ntokens != 5) { +#ifdef NDEBUG + fprintf(stderr, "%s:%d: Each line of .raf file must have 5 tokens," + " but current line has %d.\n", __FILE__, __LINE__, ntokens); +#endif + return BAD_RAF_INPUT; + } + + ++self->snpid; + + // Chromosome + char prev[RAFSTRSIZE]; + assert(sizeof prev == sizeof self->chr); + memcpy(prev, self->chr, sizeof prev); + status = snprintf(self->chr, sizeof self->chr, "%s", + Tokenizer_token(self->tkz, 0)); + if(status >= sizeof self->chr) { +#ifdef NDEBUG + fprintf(stderr, "%s:%d: chromosome name too long: %s\n", + __FILE__, __LINE__, Tokenizer_token(self->tkz, 0)); +#endif + return BUFFER_OVERFLOW; + } + int diff = strcmp(prev, self->chr); + if(diff > 0) { +#ifdef NDEBUG + fprintf(stderr, "%s:%s:%d: Chromosomes missorted in input.\n", + __FILE__, __func__, __LINE__); + fprintf(stderr, " \"%s\" precedes \"%s\".\n", + prev, self->chr); + Tokenizer_print(self->tkz, stderr); +#endif + return BAD_SORT; + } else if(diff < 0) { + // new chromosome + prevnucpos = 0UL; + } else + prevnucpos = self->nucpos; + + // Nucleotide position + self->nucpos = strtoul(Tokenizer_token(self->tkz, 1), NULL, 10); + if(prevnucpos == self->nucpos) { +#ifdef NDEBUG + fprintf(stderr, "%s:%d: Duplicate line in raf file. chr=%s pos=%lu\n", + __FILE__, __LINE__, self->chr, self->nucpos); +#endif + return BAD_SORT; + } else if(prevnucpos > self->nucpos) { +#ifdef NDEBUG + fprintf(stderr, "%s:%d: positions missorted chr=%s " + "prev=%lu curr=%lu\n", + __FILE__, __LINE__, self->chr, prevnucpos, self->nucpos); +#endif + return BAD_SORT; + } + // Reference allele + status = snprintf(self->ref, sizeof(self->ref), "%s", + Tokenizer_token(self->tkz, 2)); + strlowercase(self->ref); + + // Alternate allele + snprintf(self->alt, sizeof(self->alt), "%s", + Tokenizer_token(self->tkz, 3)); + strlowercase(self->alt); + + // Reference allele frequency + char *token, *end; + token = Tokenizer_token(self->tkz, 4); + errno = 0; + self->raf = strtod(token, &end); + if(end == token) + errno = EINVAL; + if(errno) { + char err_buff[50]; + strerror_r(errno, err_buff, sizeof(err_buff)); +#ifdef NDEBUG + fprintf(stderr, "%s:%d: Bad float \"%s\" (%s); chr=%s pos=%lu\n", + __FILE__, __LINE__, token, err_buff, self->chr, self->nucpos); +#endif + return errno; + } + + return 0; +} + +/// Rewind raf file. +/// @return 0 on success; -1 on failure +int RAFReader_rewind(RAFReader * self) { + return fseek(self->fp, 0L, SEEK_SET); +} + +#define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) +#define MIN(X,Y) ((X) > (Y) ? (Y) : (X)) + +/// Advance an array of RAFReaders to the next shared position, and +/// set derived allele frequency within each RAFReader. +/// @param[in] n number of RAFReader objects in array +/// @param[in] r array of RAFReader objects. Last one should be outgroup. +/// @return 0 on success, or one of several error codes on failure. +int RAFReader_multiNext(int n, RAFReader * r[n]) { + int i, status; + unsigned long maxnuc = 0, minnuc = ULONG_MAX; + int imaxchr; // index of reader with maximum chromosome position + int onSameChr; // indicates whether all readers are on same chromosome. + int diff; + char currchr[RAFSTRSIZE] = { '\0' }; // current chromosome + + // Set index, imaxchr, of reader with maximum + // chromosome values in lexical sort order, and + // set boolean flag, onSameChr, which indicates + // whether all readers are on same chromosome. + if((status = RAFReader_next(r[0]))) + return status; + + imaxchr = 0; + onSameChr = 1; + for(i = 1; i < n; ++i) { + if((status = RAFReader_next(r[i]))) + return status; + + diff = strcmp(r[i]->chr, r[imaxchr]->chr); + if(diff > 0) { + onSameChr = 0; + imaxchr = i; + } else if(diff < 0) + onSameChr = 0; + } + + // Loop until both chr and position are homogeneous. + do { + // get them all on the same chromosome + while(!onSameChr) { + onSameChr = 1; + for(i = 0; i < n; ++i) { + if(i == imaxchr) + continue; + while((diff = strcmp(r[i]->chr, r[imaxchr]->chr)) < 0) { + if((status = RAFReader_next(r[i]))) + return status; + } + assert(diff >= 0); + if(diff > 0) { + imaxchr = i; + onSameChr = 0; + } + } + } + + assert(onSameChr); + maxnuc = minnuc = r[0]->nucpos; + for(i = 1; i < n; ++i) { + maxnuc = MAX(maxnuc, r[i]->nucpos); + minnuc = MIN(minnuc, r[i]->nucpos); + } + + // currchr records current chromosome + status = snprintf(currchr, sizeof currchr, "%s", r[0]->chr); + if(status >= sizeof currchr) { + fprintf(stderr, "%s:%d: buffer overflow\n", __FILE__, __LINE__); + return BUFFER_OVERFLOW; + } + // Now get them all on the same position. Have to keep + // checking chr in case one file moves to another chromosome. + for(i = 0; onSameChr && i < n; ++i) { + // Increment each reader so long as we're all on the same + // chromosome and the reader's nucpos is low. + while(onSameChr && r[i]->nucpos < maxnuc) { + if((status = RAFReader_next(r[i]))) + return status; + diff = strcmp(r[i]->chr, currchr); + if(diff != 0) { + // Assertion should succeed because RAFReader_next + // guarantees that chromosomes are in sort order. + assert(diff > 0); + onSameChr = 0; + imaxchr = i; + } + } + } + } while(!onSameChr || minnuc != maxnuc); + + // Make sure reference allele isn't fixed in readers, excluding + // the outgroup (reader n-1). If it's fixed, then we can't call + // the ancestral allele. + double maxp, minp; + maxp = minp = RAFReader_raf(r[0]); + for(i = 1; i < n - 1; ++i) { + double p = RAFReader_raf(r[i]); // reference allele freq + minp = fmin(minp, p); + maxp = fmax(maxp, p); + } + if(maxp == 0.0 || minp == 1.0) + return NO_ANCESTRAL_ALLELE; + + // Make sure REF and ALT are consistent across readers + if((status = RAFReader_alleleCheck(n, r))) + return status; + + // Set derived allele frequency + double ogf = r[n - 1]->raf; // freq of ref in outgroup + if(ogf == 0.0) { + // reference allele is derived + for(i = 0; i < n; ++i) + r[i]->daf = r[i]->raf; + } else if(ogf == 1.0) { + // alternate allele is derived + for(i = 0; i < n; ++i) + r[i]->daf = 1.0 - r[i]->raf; + } else { + // outgroup is polymorphic: can't call ancestral allele + return NO_ANCESTRAL_ALLELE; + } + + return 0; +} + +/// Return 0 if ref and alt alleles of all readers match; return +/// REF_MISMATCH if there is a mismatch in REF alleles; return +/// MULTIPLE_ALT if there is a mismatch in ALT alleles. +int RAFReader_alleleCheck(int n, RAFReader * r[n]) { + char *ref = r[0]->ref; + char *alt = r[0]->alt; + int altMissing = (0 == strcmp(".", alt)); + int i; + for(i = 1; i < n; ++i) { + if(0 != strcmp(ref, r[i]->ref)) + return REF_MISMATCH; + int currAltMissing = (0 == strcmp(".", r[i]->alt)); + if(altMissing && !currAltMissing) { + altMissing = 0; + alt = r[i]->alt; + continue; + } + if(!altMissing && !currAltMissing && 0 != strcmp(alt, r[i]->alt)) + return MULTIPLE_ALT; + } + return 0; +} + +/// Print header for raf file. +void RAFReader_printHdr(FILE * fp) { + fprintf(fp, "%30s %5s %10s %3s %3s %8s %8s\n", + "file", "chr", "pos", "ref", "alt", "raf", "daf"); +} + +/// Print current line of raf file +void RAFReader_print(RAFReader * r, FILE * fp) { + assert(r->fname); + int status; + char buff[500]; + status = snprintf(buff, sizeof buff, "%s", r->fname); + if(status >= sizeof buff) { + fprintf(stderr, "%s:%s:%d: buffer overflow\n", + __FILE__, __func__, __LINE__); + exit(EXIT_FAILURE); + } + fprintf(fp, "%30s %5s %10lu %3s %3s %8.6lg %8.6lg\n", + strltrunc(buff, 30), r->chr, r->nucpos, r->ref, r->alt, r->raf, + r->daf); +} + +void RAFReader_printArray(int n, RAFReader * r[n], FILE * fp) { + int i; + for(i = 0; i < n; ++i) + RAFReader_print(r[i], fp); +} + +/// Return reference allele frequency of current line of raf file. +double RAFReader_raf(RAFReader * r) { + return r->raf; +} + +/// Return derived allele frequency of current line of raf file. +double RAFReader_daf(RAFReader * r) { + return r->daf; +} diff --git a/src/rafreader.h b/src/rafreader.h new file mode 100644 index 00000000..ad938d6d --- /dev/null +++ b/src/rafreader.h @@ -0,0 +1,49 @@ +#ifndef RAFREADER_INCLUDED +#define RAFREADER_INCLUDED + +#include "typedefs.h" +#include + +#define RAFSTRSIZE 20 + +struct RAFReader { + char *fname; + FILE *fp; + Tokenizer *tkz; + + // properties of current snp + long snpid; // 0-based index of current snp + char ref[RAFSTRSIZE]; // reference allele + char alt[RAFSTRSIZE]; // alternate alleles + char chr[RAFSTRSIZE]; // chromosome + unsigned long nucpos; // nucleotide position from raf file + double raf; // frequency of reference allele + double daf; // frequency of derived allele +}; + +RAFReader *RAFReader_new(const char *fname); +void RAFReader_clearChromosomes(int n, RAFReader *r[n]); +void RAFReader_free(RAFReader * self); +int RAFReader_next(RAFReader * self); +double RAFReader_raf(RAFReader * r); +double RAFReader_daf(RAFReader * r); +int RAFReader_alleleCheck(int n, RAFReader * r[n]); +void RAFReader_printHdr(FILE *fp); +void RAFReader_print(RAFReader *r, FILE *fp); +void RAFReader_printArray(int n, RAFReader * r[n], FILE *fp); +int RAFReader_rewind(RAFReader *self); +int RAFReader_multiNext(int n, RAFReader * r[n]); +static inline const char *RAFReader_chr(RAFReader *self); +static inline unsigned long RAFReader_nucpos(RAFReader *self); + + +/// Return const pointer to label of current chromosome. +static inline const char *RAFReader_chr(RAFReader *self) { + return self->chr; +} + +/// Return position of current nucleotide site +static inline unsigned long RAFReader_nucpos(RAFReader *self) { + return self->nucpos; +} +#endif diff --git a/src/sitepat.c b/src/sitepat.c new file mode 100644 index 00000000..47542c2e --- /dev/null +++ b/src/sitepat.c @@ -0,0 +1,678 @@ +/** +@file sitepat.c +@page sitepat +@brief Tabulate site pattern frequencies from .raf files. + +# Sitepat: tabulates site patterns + +Sitepat reads data in .raf format and tabulates counts of nucleotide +site patterns, writing the result to standard output. Optionally, it +also calculates a moving-blocks bootstrap, writing each bootstrap +replicate into a separate file. + +# Usage + + Usage: sitepat [options] = = ... + where and are arbitrary labels, and and are input + files in raf format. Writes to standard output. Labels may not include + the character ":". Maximum number of input files: 32. + + Options may include: + -f or --bootfile + Bootstrap output file basename. Def: sitepat.boot. + -r or --bootreps + # of bootstrap replicates. Def: 0 + -b or --blocksize + # of SNPs per block in moving-blocks bootstrap. Def: 0. + -1 or --singletons + Use singleton site patterns + -m or --logMismatch + log AA/DA mismatches to sitepat.log + -A or --logAA + log sites with uncallable ancestral alleles + -a or --logAll + log all sites to sitepat.log + -h or --help + Print this message + +# Example + +Before running `sitepat`, use @ref raf "raf" to convert the input data +into raf format. Let us assume you have done this, and that directory +~/raf contains a separate raf file for each population. We want to +compare 4 populations, whose .raf files are `yri.raf`, `ceu.raf`, +`altai.raf`, and `denisova.raf`. The following command will do this, +putting the results into `obs.txt`. + + sitepat x=~/raf/yri.raf \ + y=~/raf/ceu.raf \ + n=~/raf/altai.raf \ + d=~/raf/denisova.raf > obs.txt + +Here, "x", "y", "n", and "d" are labels that will be used to identify +site patterns in the output. For example, site pattern "x:y" refers to +the pattern in which the derived allele is present haploid samples +from "x" and "y" but not on those from other populations. The order of +the command-line arguments determines the order in which labels are +sorted on output. Given the command line above, we would get a site +pattern labeled "x:y:d" rather than, say, "y:x:d". + +The output looks like this: + + # Population labels: + # x = /home/rogers/raf/yri.raf + # y = /home/rogers/raf/ceu.raf + # n = /home/rogers/raf/altai.raf + # d = /home/rogers/raf/denisova.raf + # Excluding singleton site patterns. + # Number of site patterns: 10 + # Tabulated 12327755 SNPs + # SitePat E[count] + x:y 340952.4592501 + x:n 46874.1307236 + x:d 46034.4670204 + y:n 55137.4236715 + y:d 43535.5248078 + n:d 231953.3372578 + x:y:n 91646.1277991 + x:y:d 88476.9619569 + x:n:d 96676.3877423 + y:n:d 100311.4411513 + +The left column lists the site patterns that occur in the data. The +right column gives the expected count of each site pattern. These are +not integers, because they represent averages over all possible +subsamples consisting of a single haploid genome from each +population. + +In the raf files used as input, chromosomes should appear in lexical +order. Within each chromosome, nucleotides should appear in numerical +order. There should be no duplicate (chromosome, position) +pairs. Otherwise, the program aborts with an error. + +To generate a bootstrap, use the `--bootreps` option: + + sitepat --bootreps 50 \ + x=~/raf/yri.raf \ + y=~/raf/ceu.raf \ + n=~/raf/altai.raf \ + d=~/raf/denisova.raf > obs.txt + +This will generate not only the primary output file, `obs.txt`, but also +50 additional files, each representing a single bootstrap +replicate. The primary output file now has a bootstrap confidence +interval: + + # Population labels: + # x = /home/rogers/raf/yri.raf + # y = /home/rogers/raf/ceu.raf + # n = /home/rogers/raf/altai.raf + # d = /home/rogers/raf/denisova.raf + # Excluding singleton site patterns. + # Number of site patterns: 10 + # Tabulated 12327755 SNPs + # bootstrap output file = sitepat.boot + # confidence level = 95% + # SitePat E[count] loBnd hiBnd + x:y 340952.4592501 338825.6604586 342406.6670816 + x:n 46874.1307236 46361.5798377 47438.1857029 + x:d 46034.4670204 45605.6588012 46631.6434277 + y:n 55137.4236715 54650.0763578 55783.7051253 + y:d 43535.5248078 43110.5119922 44234.0919024 + n:d 231953.3372578 229495.3741057 234173.6878092 + x:y:n 91646.1277991 90494.0219749 92873.4443706 + x:y:d 88476.9619569 87137.1867967 89585.8431419 + x:n:d 96676.3877423 95935.5184294 97417.6241185 + y:n:d 100311.4411513 99292.9839140 101163.3457462 + +Here, `loBnd` and `hiBnd` are the limits of a 95% confidence +interval. The bootstrap output files look like `sitepat.boot000`, +`sitepat.boot001`, and so on. + +@copyright Copyright (c) 2016, Alan R. Rogers +. This file is released under the Internet +Systems Consortium License, which can be found in file "LICENSE". +*/ + +#include "binary.h" +#include "boot.h" +#include "rafreader.h" +#include "misc.h" +#include "strint.h" +#include "error.h" +#include "typedefs.h" +#include "version.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAXCHR 24 // maximum number of chromosomes + +typedef struct Stack Stack; + +/// Treat a vector of tipId_t values as a push-down stack. +struct Stack { + int dim, nused; + tipId_t *buff; // not locally owned +}; + +static void usage(void); +static Stack *Stack_new(int dim, tipId_t buff[dim]); +static void Stack_free(Stack * stk); +static void Stack_push(Stack * self, tipId_t x); +static void generatePatterns(int bit, int npops, Stack * stk, tipId_t pat, + int doSing); + +const char *useMsg = + "\nUsage: sitepat [options] = = ... outgroup=\n" + " where and are arbitrary labels, and are input\n" + " files in raf format. Writes to standard output." + " Labels may not include\n" + " the character \":\". Final label must be \"outgroup\".\n"; + +/// Print usage message and die. +static void usage(void) { + fputs(useMsg, stderr); + fprintf(stderr, " Maximum number of input files: %lu plus outgroup.\n", + 8 * sizeof(tipId_t)); + fputs("\nOptions may include:\n", stderr); + tellopt("-f or --bootfile ", + "Bootstrap output file basename. Def: sitepat.boot."); + tellopt("-r or --bootreps ", "# of bootstrap replicates. Def: 0"); + tellopt("-b or --blocksize ", + "# of SNPs per block in moving-blocks bootstrap. Def: 0."); + tellopt("-1 or --singletons", "Use singleton site patterns"); + tellopt("-m or --logMismatch", "Log REF mismatches to sitepat.log"); + tellopt("-A or --logAA", "Log sites with uncallable ancestral allele"); + tellopt("--version", "Print version and exit"); + tellopt("-h or --help", "Print this message"); + exit(1); +} + +/// This stack is local to this file. It provides a bounds-controlled +/// interface to an external array, which is passed as an argument, buff, +/// to Stack_new. +static Stack *Stack_new(int dim, tipId_t buff[dim]) { + Stack *self = malloc(sizeof(Stack)); + CHECKMEM(self); + self->dim = dim; + self->buff = buff; + self->nused = 0; + return self; +} + +/// Frees the stack but not the underlying buffer. +static void Stack_free(Stack * stk) { + free(stk); +} + +/// Add an entry to the stack, checking bounds. +static void Stack_push(Stack * self, tipId_t x) { + if(self->nused == self->dim) { + fprintf(stderr, "%s:%s:%d ERR: buffer overflow\n", + __FILE__, __func__, __LINE__); + exit(EXIT_FAILURE); + } + self->buff[self->nused++] = x; +} + +/// Call as generatePatterns(0, npops, stk, 0); Recursive function, +/// which generates all legal site patterns and pushes them onto a +/// stack. +static void +generatePatterns(int bit, int npops, Stack * stk, tipId_t pat, int doSing) { + assert(sizeof(tipId_t) < sizeof(unsigned long long)); + if(bit == npops) { + // Recursion stops here. If current pattern is + // legal, then push it onto the stack. Then return. + + // Exclude patterns with all bits on, or all bits off. + if(pat == 0 || pat == (1ULL << npops) - 1ULL) + return; + // Exclude singleton patterns unless "doSing" is true. + if(!doSing && isPow2(pat)) + return; + Stack_push(stk, pat); + return; + } + tipId_t on = 1UL << bit; + generatePatterns(bit + 1, npops, stk, pat | on, doSing); // curr bit on + generatePatterns(bit + 1, npops, stk, pat, doSing); // curr bit off +} + +int main(int argc, char **argv) { + int i, j, status, optndx, done; + int doSing = 0; // nonzero means use singleton site patterns + long bootreps = 0; + double conf = 0.95; // confidence level + long blocksize = 500; + StrInt *strint = StrInt_new(); + char bootfname[FILENAMESIZE] = { '\0' }; + char errbuff[100] = { '\0' }; + const char *logfname = "sitepat.log"; + int logMismatch = 0, logAA = 0; + FILE *logfile = NULL; + + static struct option myopts[] = { + // {char *name, int has_arg, int *flag, int val} + {"bootfile", required_argument, 0, 'f'}, + {"bootreps", required_argument, 0, 'r'}, + {"blocksize", required_argument, 0, 'b'}, + {"singletons", no_argument, 0, '1'}, + {"logMismatch", no_argument, 0, 'm'}, + {"logAA", no_argument, 0, 'A'}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + {NULL, 0, NULL, 0} + }; + + // command line arguments + for(;;) { + i = getopt_long(argc, argv, "b:c:f:hr:t:mAv1", myopts, &optndx); + if(i == -1) + break; + switch (i) { + case ':': + case '?': + usage(); + break; + case 'b': + blocksize = strtod(optarg, NULL); + if(blocksize <= 0) { + fprintf(stderr, + "%s:%d: bad argument to -b or --blocksize: \"%s\"\n", + __FILE__, __LINE__, optarg); + usage(); + } + break; + case 'f': + status = snprintf(bootfname, sizeof bootfname, "%s", optarg); + if(status >= sizeof bootfname) { + fprintf(stderr, "%s:%d: ERR: Filename %s is too large." + " Max: %zu\n", + __FILE__, __LINE__, optarg, sizeof(bootfname) - 1); + exit(EXIT_FAILURE); + } + break; + case 'V': + printf("sitepat version %s\n", VERSION); + return 0; + case 'h': + usage(); + break; + case 'r': + bootreps = strtol(optarg, NULL, 10); + break; + case '1': + doSing = 1; + break; + case 'm': + logMismatch = 1; + break; + case 'A': + logAA = 1; + break; + default: + usage(); + } + } + + // remaining options: input files + int n = argc - optind; // number of input files + int m = n-1; // number excluding outgroup + if(n == 0) + usage(); + + char *poplbl[n]; + char *fname[n]; + LblNdx lndx; + LblNdx_init(&lndx); + RAFReader *r[n]; + + // Number of inputs can't exceed number of bits in an object of + // type tipId_t. + if(m > 8 * sizeof(tipId_t)) { + fprintf(stderr, "Error: %d input files. Max is %lu.\n", + n, 8*sizeof(tipId_t) + 1); + usage(); + } + // Parse remaining arguments, each of which should be of form + // x=foo, where x is an arbitrary label and foo is the name of an + // input file. Last label must be "outgroup". + for(i = 0; i < n; ++i) { + fname[i] = poplbl[i] = argv[i + optind]; + (void) strsep(fname + i, "="); + if(fname[i] == NULL + || poplbl[i] == NULL + || strlen(poplbl[i]) == 0 + || strlen(fname[i]) == 0 || strchr(poplbl[i], ':') != NULL) + usage(); + if(i < m) + LblNdx_addSamples(&lndx, 1, poplbl[i]); + r[i] = RAFReader_new(fname[i]); + } + if(0 != strcmp("outgroup", poplbl[n-1])) { + fprintf(stderr,"%s:%d: last label is \"%s\"" + " instead of \"outgroup\".\n", + __FILE__,__LINE__, poplbl[n-1]); + usage(); + } + + if(logMismatch || logAA) { + logfile = fopen(logfname, "w"); + if(logfile == NULL) { + fprintf(stderr, "Can't write to file \"%s\".\n", logfname); + exit(EXIT_FAILURE); + } + } + + // Default boot file name + if(bootfname[0] == '\0') { + const char *defName = "sitepat.boot"; + status = snprintf(bootfname, sizeof bootfname, "%s", defName); + if(status >= sizeof bootfname) { + fprintf(stderr, "%s:%d: ERR: Filename %s is too large." + " Max: %zu\n", + __FILE__, __LINE__, defName, sizeof(bootfname) - 1); + exit(EXIT_FAILURE); + } + } + + printf("# sitepat version %s\n", VERSION); + printf("# Population labels:\n"); + for(i = 0; i < n; ++i) + printf("# %8s=%s\n", poplbl[i], fname[i]); + + // make sure labels are all different + for(i = 1; i < n; ++i) + for(j = 0; j < i; ++j) + if(0 == strcmp(poplbl[i], poplbl[j])) { + fprintf(stderr, "ERR: duplicate labels on command line.\n"); + fprintf(stderr, " duplicated label: %s\n", poplbl[i]); + exit(EXIT_FAILURE); + } + + unsigned long npat = (1UL << m) - 2UL; // number of site patterns + if(!doSing) + npat -= m; + printf("# %s singleton site patterns.\n", + (doSing ? "Including" : "Excluding")); + printf("# Number of site patterns: %lu\n", npat); + tipId_t pat[npat]; + double patCount[npat]; + int lblsize = 100; + char lblbuff[lblsize]; + memset(patCount, 0, sizeof(patCount)); + + { + // Stack is a interface to array "pat". + Stack *stk = Stack_new(npat, pat); + + // Put site patterns into array "pat". + generatePatterns(0, m, stk, 0, doSing); + Stack_free(stk); + } + + // Sort site patterns. Major sort is by number of "on" bits, + // so that doubleton patterns come first, then tripletons, ets. + // Secondary sort is by order in which labels are listed + // on the command line. + qsort(pat, (size_t) npat, sizeof(pat[0]), compare_tipId); + fflush(stdout); + + // Used by bootstrap + Boot *boot = NULL; + int nchr = 0; + char prev[RAFSTRSIZE], chr[RAFSTRSIZE] = { '\0' }; + long nsnp[MAXCHR]; + memset(nsnp, 0, sizeof nsnp); + + // Read the data to get dimensions: number of chromosomes and + // number of snps per chromosome. Then use these dimensions to + // allocate a bootstrap object. + if(bootreps > 0) { + fprintf(stderr, "Doing 1st pass through data to get dimensions...\n"); + + // First pass through data sets values of + // nchr + // nsnp[i] {i=0..nchr-1} + done=0; + while(!done) { + status = RAFReader_multiNext(n, r); + switch(status) { + case 0: + break; + case EOF: + done=1; + continue; + case REF_MISMATCH: + case MULTIPLE_ALT: + case NO_ANCESTRAL_ALLELE: + continue; + default: + // something wrong. + mystrerror_r(status, errbuff, sizeof errbuff); + fprintf(stderr,"%s:%d: input error (%s)\n", + __FILE__,__LINE__, errbuff); + exit(EXIT_FAILURE); + } + + assert(strlen(RAFReader_chr(r[0])) < sizeof prev); + strcpy(prev, chr); + strcpy(chr, RAFReader_chr(r[0])); + int diff = strcmp(prev, chr); + if(diff != 0) { + StrInt_insert(strint, chr, nchr); + nsnp[nchr] = 1; + ++nchr; + } else + ++nsnp[nchr - 1]; + } + + for(i = 0; i < n; ++i) { + status = RAFReader_rewind(r[i]); + if(status) { + fprintf(stderr, "%s:%d: ERR: can't rewind input stream.\n", + __FILE__, __LINE__); + fprintf(stderr, " If --bootreps > 0, inputs must be" + " files, not pipes.\n"); + exit(EXIT_FAILURE); + } + } + + // Allocate Boot structure + gsl_rng *rng = gsl_rng_alloc(gsl_rng_taus); + gsl_rng_set(rng, (unsigned long) time(NULL)); + boot = Boot_new(nchr, nsnp, bootreps, npat, blocksize, rng); + gsl_rng_free(rng); + CHECKMEM(boot); + } + + unsigned long nsites = 0, nbadaa = 0, nbadref=0, nmultalt=0; + long snpndx = -1; + + // Iterate through raf files + fprintf(stderr, "Doing %s pass through data to tabulate patterns..\n", + bootreps > 0 ? "2nd" : "single"); + int chrndx = -1, currChr = INT_MAX; + RAFReader_clearChromosomes(n, r); + done=0; + while( !done ) { + status = RAFReader_multiNext(n, r); + switch(status) { + case 0: + ++nsites; + break; + case EOF: + done=1; + continue; + case REF_MISMATCH: + ++nsites; + ++nbadref; + if(logMismatch) { + fprintf(logfile,"REF mismatch:\n"); + RAFReader_printArray(n, r, logfile); + } + continue; + case MULTIPLE_ALT: + ++nsites; + ++nmultalt; + continue; + case NO_ANCESTRAL_ALLELE: + ++nsites; + ++nbadaa; + if(logAA) { + fprintf(logfile,"Uncallable AA:\n"); + RAFReader_printArray(n, r, logfile); + } + continue; + default: + // something wrong. + mystrerror_r(status, errbuff, sizeof errbuff); + fprintf(stderr,"%s:%d: input error (%s)\n", + __FILE__,__LINE__, errbuff); + exit(EXIT_FAILURE); + } + + if(bootreps > 0) { + // chrndx is index of current chromosome + errno = 0; + chrndx = StrInt_get(strint, RAFReader_chr(r[0])); + if(errno) { + fprintf(stderr, + "%s:%d: ERR: missing index for chromosome: %s\n", + __FILE__, __LINE__, RAFReader_chr(r[0])); + exit(EXIT_FAILURE); + } + if(chrndx != currChr) { + currChr = chrndx; + snpndx = 0; + } else + ++snpndx; + +#ifndef NDEBUG + assert(snpndx < nsnp[chrndx]); +#endif + } + // p and q are frequencies of derived and ancestral alleles + double p[m], q[m]; + for(j = 0; j < m; ++j) { + p[j] = RAFReader_daf(r[j]); // derived allele freq + q[j] = 1.0 - p[j]; + } + + // Contribution of current snp to each site pattern. Inner + // loop considers each bit in current pattern. If that bit is + // on, multiply z by the derived allele frequency, p. If + // that bit is off, multiply by q=1-p. In the end, z is Prod + // p[j]^bit[j] * q[j]^(1-bit[j]) where bit[j] is the value (0 + // or 1) of the j'th bit. + for(i = 0; i < npat; ++i) { + tipId_t pattern = pat[i]; + double z = 1.0; + for(j = 0; j < m; ++j) { + if(pattern & 1u) + z *= p[j]; + else + z *= q[j]; + pattern >>= 1u; + } + if(!isfinite(z)) { + fprintf(stderr, "%s:%d nonfinite z=%lf\n", + __FILE__, __LINE__, z); + fprintf(stderr, " pattern=%d\n", pat[i]); + for(j = 0; j < m; ++j) + fprintf(stderr, " %d: p=%lf q=%lf\n", j, p[j], q[j]); + } + assert(0 == (pattern & 1)); + patCount[i] += z; + if(bootreps > 0) { + assert(snpndx >= 0); + assert(chrndx >= 0); + Boot_add(boot, chrndx, snpndx, i, z); + } + } +#ifndef NDEBUG + if(bootreps > 0) + Boot_sanityCheck(boot, __FILE__, __LINE__); +#endif + } + printf("# Aligned sites : %lu\n", nsites); + if(nbadref) + printf("# Disagreements about ref allele : %lu\n", nbadref); + if(nmultalt) + printf("# Sites with multiple alt alleles: %lu\n", nmultalt); + if(nbadaa) + printf("# Undetermined ancestral allele : %lu\n", nbadaa); + printf("# Sites used : %lu\n", + nsites - nbadaa - nbadref - nmultalt); + + // boottab[i][j] is the count of the j'th site pattern + // in the i'th bootstrap replicate. + double bootvals[bootreps]; + double boottab[bootreps][npat]; + memset(boottab, 0, sizeof boottab); + + if(bootreps > 0) { + printf("# %s = %s\n", "bootstrap output file", bootfname); + printf("# %s = %4.2lf%%\n", "confidence level", 100 * conf); +#ifndef NDEBUG + Boot_sanityCheck(boot, __FILE__, __LINE__); +#endif + // put site pattern counts into matrix boottab. + for(i = 0; i < bootreps; ++i) + Boot_aggregate(boot, i, npat, boottab[i]); + + // write an output file for each bootstrap replicate + for(j = 0; j < bootreps; ++j) { + char buff[FILENAMESIZE + 3]; + status = snprintf(buff, sizeof buff, "%s%03d", bootfname, j); + if(status >= sizeof buff) + DIE("buffer overflow in snprintf"); + + FILE *fp = fopen(buff, "w"); + if(fp == NULL) + DIE("bad fopen"); + fprintf(fp, "# %13s %20s", "SitePat", "E[count]\n"); + for(i = 0; i < npat; ++i) { + fprintf(fp, "%15s %20.7lf\n", + patLbl(lblsize, lblbuff, pat[i], &lndx), + boottab[j][i]); + } + fclose(fp); + } + } + // print labels and binary representation of site patterns + printf("# %13s %20s", "SitePat", "E[count]"); + if(bootreps > 0) + printf(" %15s %15s", "loBnd", "hiBnd"); + putchar('\n'); + for(i = 0; i < npat; ++i) { + printf("%15s %20.7lf", + patLbl(lblsize, lblbuff, pat[i], &lndx), patCount[i]); + if(bootreps > 0) { + double lowBnd, highBnd; + for(j = 0; j < bootreps; ++j) + bootvals[j] = boottab[j][i]; + confidenceBounds(&lowBnd, &highBnd, conf, bootreps, bootvals); + printf(" %15.7lf %15.7lf", lowBnd, highBnd); + } + putchar('\n'); + } + + for(i = 0; i < n; ++i) + RAFReader_free(r[i]); + if(bootreps > 0) + Boot_free(boot); + StrInt_free(strint); + if(logfile) + fclose(logfile); + fprintf(stderr, "sitepat is finished\n"); + return 0; +} diff --git a/src/tabpat.c b/src/tabpat.c index f55afe5d..ecac2417 100644 --- a/src/tabpat.c +++ b/src/tabpat.c @@ -26,8 +26,6 @@ replicate into a separate file. # of SNPs per block in moving-blocks bootstrap. Def: 0. -1 or --singletons Use singleton site patterns - -m or --logMismatch - log AA/DA mismatches to tabpat.log -F or --logFixed log fixed sites to tabpat.log -a or --logAll @@ -52,7 +50,10 @@ putting the results into `obs.txt`. Here, "x", "y", "n", and "d" are labels that will be used to identify site patterns in the output. For example, site pattern "x:y" refers to the pattern in which the derived allele is present haploid samples -from "x" and "y" but not on those from other populations. +from "x" and "y" but not on those from other populations. The order of +the command-line arguments determines the order in which labels are +sorted on output. Given the command line above, we would get a site +pattern labeled "x:y:d" rather than, say, "y:x:d". The output looks like this: @@ -137,6 +138,8 @@ Systems Consortium License, which can be found in file "LICENSE". #include "misc.h" #include "strint.h" #include "typedefs.h" +#include "version.h" +#include "error.h" #include #include #include @@ -182,9 +185,9 @@ static void usage(void) { tellopt("-b or --blocksize ", "# of SNPs per block in moving-blocks bootstrap. Def: 0."); tellopt("-1 or --singletons", "Use singleton site patterns"); - tellopt("-m or --logMismatch", "log AA/DA mismatches to tabpat.log"); tellopt("-F or --logFixed", "log fixed sites to tabpat.log"); tellopt("-a or --logAll", "log all sites to tabpat.log"); + tellopt("--version", "Print version and exit"); tellopt("-h or --help", "Print this message"); exit(1); } @@ -241,15 +244,16 @@ generatePatterns(int bit, int npops, Stack * stk, tipId_t pat, int doSing) { } int main(int argc, char **argv) { - int i, j, status, optndx; + int i, j, status, optndx, done; int doSing = 0; // nonzero means use singleton site patterns long bootreps = 0; double conf = 0.95; // confidence level long blocksize = 500; StrInt *strint = StrInt_new(); char bootfname[FILENAMESIZE] = { '\0' }; + char errbuff[100] = { '\0' }; const char *logfname = "tabpat.log"; - int logMismatch = 0, logFixed = 0, logAll = 0; + int logFixed = 0, logAll = 0; FILE *logfile = NULL; static struct option myopts[] = { @@ -258,17 +262,17 @@ int main(int argc, char **argv) { {"bootreps", required_argument, 0, 'r'}, {"blocksize", required_argument, 0, 'b'}, {"singletons", no_argument, 0, '1'}, - {"logMismatch", no_argument, 0, 'm'}, {"logFixed", no_argument, 0, 'F'}, {"logAll", no_argument, 0, 'a'}, {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, // {"threads", required_argument, 0, 't'}, {NULL, 0, NULL, 0} }; // command line arguments for(;;) { - i = getopt_long(argc, argv, "ab:c:f:hr:t:mFv1", myopts, &optndx); + i = getopt_long(argc, argv, "ab:c:f:hr:t:Fv1", myopts, &optndx); if(i == -1) break; switch (i) { @@ -297,15 +301,15 @@ int main(int argc, char **argv) { case 'h': usage(); break; + case 'V': + printf("tabpat version %s\n", VERSION); + return 0; case 'r': bootreps = strtol(optarg, NULL, 10); break; case '1': doSing = 1; break; - case 'm': - logMismatch = 1; - break; case 'F': logFixed = 1; break; @@ -350,7 +354,7 @@ int main(int argc, char **argv) { r[i] = DAFReader_new(fname[i]); } - if(logMismatch || logFixed || logAll) { + if(logFixed || logAll) { logfile = fopen(logfname, "w"); if(logfile == NULL) { fprintf(stderr, "Can't write to file \"%s\".\n", logfname); @@ -369,6 +373,7 @@ int main(int argc, char **argv) { } } + printf("# tabpat version %s\n", VERSION); printf("# Population labels:\n"); for(i = 0; i < n; ++i) printf("# %4s=%s\n", poplbl[i], fname[i]); @@ -427,12 +432,25 @@ int main(int argc, char **argv) { // First pass through data sets values of // nchr // nsnp[i] {i=0..nchr-1} - while(EOF != DAFReader_multiNext(n, r)) { - - // Skip loci at which data sets disagree about which allele - // is derived and which ancestral. - if(!DAFReader_allelesMatch(n, r)) + done = 0; + while(!done) { + status = DAFReader_multiNext(n, r); + switch(status) { + case 0: + break; + case EOF: + done=1; continue; + case ALLELE_MISMATCH: + case NO_ANCESTRAL_ALLELE: + continue; + default: + // something wrong + mystrerror_r(status, errbuff, sizeof errbuff); + fprintf(stderr,"%s:%d: input error (%s)\n", + __FILE__,__LINE__, errbuff); + exit(EXIT_FAILURE); + } assert(strlen(DAFReader_chr(r[0])) < sizeof prev); strcpy(prev, chr); @@ -473,21 +491,27 @@ int main(int argc, char **argv) { bootreps > 0 ? "2nd" : "single"); int chrndx = -1, currChr = INT_MAX; DAFReader_clearChromosomes(n, r); - while(EOF != DAFReader_multiNext(n, r)) { - - ++nsites; // count sites that align across populations. - // Skip loci at which data sets disagree about which allele - // is derived and which ancestral. - if(!DAFReader_allelesMatch(n, r)) { - if(logMismatch) { - assert(logfile); - fprintf(logfile, "Mismatch\n"); - DAFReader_printHdr(logfile); - for(i = 0; i < n; ++i) - DAFReader_print(r[i], logfile); - } + done=0; + while(!done) { + status = DAFReader_multiNext(n, r); + switch(status) { + case 0: + ++nsites; + break; + case EOF: + done=1; + continue; + case ALLELE_MISMATCH: + case NO_ANCESTRAL_ALLELE: ++nbadaa; + ++nsites; continue; + default: + // something wrong + mystrerror_r(status, errbuff, sizeof errbuff); + fprintf(stderr,"%s:%d: input error (%s)\n", + __FILE__,__LINE__, errbuff); + exit(EXIT_FAILURE); } if(bootreps > 0) { @@ -511,25 +535,10 @@ int main(int argc, char **argv) { #endif } // p and q are frequencies of derived and ancestral alleles - double p[n], q[n], minp, maxp; - minp = 1.0; - maxp = 0.0; + double p[n], q[n]; for(j = 0; j < n; ++j) { p[j] = DAFReader_daf(r[j]); // derived allele freq - q[j] = 1 - p[j]; - minp = fmin(minp, p[j]); - maxp = fmax(maxp, p[j]); - } - if(maxp == 0.0 || minp == 1.0) { - if(logFixed) { - assert(logfile); - fprintf(logfile, "All p=%lf\n", p[0]); - DAFReader_printHdr(logfile); - for(i = 0; i < n; ++i) - DAFReader_print(r[i], logfile); - } - ++nfixed; - continue; + q[j] = 1.0 - p[j]; } if(logAll) { @@ -575,7 +584,7 @@ int main(int argc, char **argv) { } printf("# Sites aligned across all populations: %lu\n", nsites); if(nbadaa) - printf("# Disagreements about ancestral allele: %lu\n", nbadaa); + printf("# Disagreements about alleles : %lu\n", nbadaa); if(nfixed) printf("# Monomorphic sites : %lu\n", nfixed); printf("# Sites used : %lu\n", diff --git a/src/tinyexpr.c b/src/tinyexpr.c new file mode 100644 index 00000000..9356721b --- /dev/null +++ b/src/tinyexpr.c @@ -0,0 +1,826 @@ +/* + * TINYEXPR - Tiny recursive descent parser and evaluation engine in C + * + * Copyright (c) 2015, 2016 Lewis Van Winkle + * + * http://CodePlea.com + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgement in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* COMPILE TIME OPTIONS */ + +/* Exponentiation associativity: +For a^b^c = (a^b)^c and -a^b = (-a)^b do nothing. +For a^b^c = a^(b^c) and -a^b = -(a^b) uncomment the next line.*/ + +/* #define TE_POW_FROM_RIGHT */ + +/* Logarithms +For log = base 10 log do nothing +For log = natural log uncomment the next line. */ + +/* #define TE_NAT_LOG */ + +#include "tinyexpr.h" +#include +#include +#include +#include +#include +#include +#include + +#ifndef NAN +#define NAN (0.0/0.0) +#endif + +#ifndef INFINITY +#define INFINITY (1.0/0.0) +#endif + +typedef double (*te_fun2) (double, double); + +enum { + TOK_NULL = TE_CLOSURE7 + 1, TOK_ERROR, TOK_END, TOK_SEP, + TOK_OPEN, TOK_CLOSE, TOK_NUMBER, TOK_VARIABLE, TOK_INFIX +}; + +enum { TE_CONSTANT = 1 }; + +typedef struct state { + const char *start; + const char *next; + int type; + union { + double value; + const double *bound; + void *function; + }; + void *context; + + const te_variable *lookup; + int lookup_len; +} state; + +#define TYPE_MASK(TYPE) ((TYPE)&0x0000001F) + +#define IS_PURE(TYPE) (((TYPE) & TE_FLAG_PURE) != 0) +#define IS_FUNCTION(TYPE) (((TYPE) & TE_FUNCTION0) != 0) +#define IS_CLOSURE(TYPE) (((TYPE) & TE_CLOSURE0) != 0) +#define ARITY(TYPE) ( ((TYPE) & (TE_FUNCTION0 | TE_CLOSURE0)) ? ((TYPE) & 0x00000007) : 0 ) +#define NEW_EXPR(type, ...) new_expr((type), (const te_expr*[]){__VA_ARGS__}) + +void te_free_parameters(te_expr * n); +void next_token(state * s); +static double pi(void); +static double e(void); + +static te_expr *new_expr(const int type, const te_expr * parameters[]) { + const int arity = ARITY(type); + const int psize = sizeof(void *) * arity; + const int size = + (sizeof(te_expr) - sizeof(void *)) + psize + + (IS_CLOSURE(type) ? sizeof(void *) : 0); + te_expr *ret = malloc(size); + memset(ret, 0, size); + if(arity && parameters) { + memcpy(ret->parameters, parameters, psize); + } + ret->type = type; + ret->bound = 0; + return ret; +} + +void te_free_parameters(te_expr * n) { + if(!n) + return; + switch (TYPE_MASK(n->type)) { + case TE_FUNCTION7: + case TE_CLOSURE7: + te_free(n->parameters[6]); + case TE_FUNCTION6: + case TE_CLOSURE6: + te_free(n->parameters[5]); + case TE_FUNCTION5: + case TE_CLOSURE5: + te_free(n->parameters[4]); + case TE_FUNCTION4: + case TE_CLOSURE4: + te_free(n->parameters[3]); + case TE_FUNCTION3: + case TE_CLOSURE3: + te_free(n->parameters[2]); + case TE_FUNCTION2: + case TE_CLOSURE2: + te_free(n->parameters[1]); + case TE_FUNCTION1: + case TE_CLOSURE1: + te_free(n->parameters[0]); + } +} + +void te_free(te_expr * n) { + if(!n) + return; + te_free_parameters(n); + free(n); +} + +static double pi(void) { + return 3.14159265358979323846; +} +static double e(void) { + return 2.71828182845904523536; +} +static double fac(double a) { /* simplest version of fac */ + if(a < 0.0) + return NAN; + if(a > UINT_MAX) + return INFINITY; + unsigned int ua = (unsigned int) (a); + unsigned long int result = 1, i; + for(i = 1; i <= ua; i++) { + if(i > ULONG_MAX / result) + return INFINITY; + result *= i; + } + return (double) result; +} +static double ncr(double n, double r) { + if(n < 0.0 || r < 0.0 || n < r) + return NAN; + if(n > UINT_MAX || r > UINT_MAX) + return INFINITY; + unsigned long int un = (unsigned int) (n), ur = (unsigned int) (r), i; + unsigned long int result = 1; + if(ur > un / 2) + ur = un - ur; + for(i = 1; i <= ur; i++) { + if(result > ULONG_MAX / (un - ur + i)) + return INFINITY; + result *= un - ur + i; + result /= i; + } + return result; +} +static double npr(double n, double r) { + return ncr(n, r) * fac(r); +} + +static const te_variable functions[] = { + /* must be in alphabetical order */ + {"abs", fabs, TE_FUNCTION1 | TE_FLAG_PURE, 0}, + {"acos", acos, TE_FUNCTION1 | TE_FLAG_PURE, 0}, + {"asin", asin, TE_FUNCTION1 | TE_FLAG_PURE, 0}, + {"atan", atan, TE_FUNCTION1 | TE_FLAG_PURE, 0}, + {"atan2", atan2, TE_FUNCTION2 | TE_FLAG_PURE, 0}, + {"ceil", ceil, TE_FUNCTION1 | TE_FLAG_PURE, 0}, + {"cos", cos, TE_FUNCTION1 | TE_FLAG_PURE, 0}, + {"cosh", cosh, TE_FUNCTION1 | TE_FLAG_PURE, 0}, + {"e", e, TE_FUNCTION0 | TE_FLAG_PURE, 0}, + {"exp", exp, TE_FUNCTION1 | TE_FLAG_PURE, 0}, + {"fac", fac, TE_FUNCTION1 | TE_FLAG_PURE, 0}, + {"floor", floor, TE_FUNCTION1 | TE_FLAG_PURE, 0}, + {"ln", log, TE_FUNCTION1 | TE_FLAG_PURE, 0}, +#ifdef TE_NAT_LOG + {"log", log, TE_FUNCTION1 | TE_FLAG_PURE, 0}, +#else + {"log", log10, TE_FUNCTION1 | TE_FLAG_PURE, 0}, +#endif + {"log10", log10, TE_FUNCTION1 | TE_FLAG_PURE, 0}, + {"ncr", ncr, TE_FUNCTION2 | TE_FLAG_PURE, 0}, + {"npr", npr, TE_FUNCTION2 | TE_FLAG_PURE, 0}, + {"pi", pi, TE_FUNCTION0 | TE_FLAG_PURE, 0}, + {"pow", pow, TE_FUNCTION2 | TE_FLAG_PURE, 0}, + {"sin", sin, TE_FUNCTION1 | TE_FLAG_PURE, 0}, + {"sinh", sinh, TE_FUNCTION1 | TE_FLAG_PURE, 0}, + {"sqrt", sqrt, TE_FUNCTION1 | TE_FLAG_PURE, 0}, + {"tan", tan, TE_FUNCTION1 | TE_FLAG_PURE, 0}, + {"tanh", tanh, TE_FUNCTION1 | TE_FLAG_PURE, 0}, + {0, 0, 0, 0} +}; + +static const te_variable *find_builtin(const char *name, int len) { + int imin = 0; + int imax = sizeof(functions) / sizeof(te_variable) - 2; + + /*Binary search. */ + while(imax >= imin) { + const int i = (imin + ((imax - imin) / 2)); + int c = strncmp(name, functions[i].name, len); + if(!c) + c = '\0' - functions[i].name[len]; + if(c == 0) { + return functions + i; + } else if(c > 0) { + imin = i + 1; + } else { + imax = i - 1; + } + } + + return 0; +} + +static const te_variable *find_lookup(const state * s, const char *name, + int len) { + int iters; + const te_variable *var; + if(!s->lookup) + return 0; + + for(var = s->lookup, iters = s->lookup_len; iters; ++var, --iters) { + if(strncmp(name, var->name, len) == 0 && var->name[len] == '\0') { + return var; + } + } + return 0; +} + +static double add(double a, double b) { + return a + b; +} +static double sub(double a, double b) { + return a - b; +} +static double mul(double a, double b) { + return a * b; +} +static double divide(double a, double b) { + return a / b; +} +static double negate(double a) { + return -a; +} +static double comma(double a, double b) { + (void) a; + return b; +} + +void next_token(state * s) { + s->type = TOK_NULL; + + do { + + if(!*s->next) { + s->type = TOK_END; + return; + } + + /* Try reading a number. */ + char *end; + errno = 0; + s->value = strtod(s->next, &end); + if(errno) { + /* overflow or underflow */ + s->type = TOK_ERROR; + s->next = end; + }else if(end != s->next) { + s->next = end; + s->type = TOK_NUMBER; + } else { + /* Look for a variable or builtin function call. */ + if(isalpha(*s->next)) { + const char *start; + start = s->next; + while(isalnum(*s->next) || *s->next == '_') + s->next++; + + const te_variable *var = + find_lookup(s, start, s->next - start); + if(!var) + var = find_builtin(start, s->next - start); + + if(!var) { + s->type = TOK_ERROR; + } else { + switch (TYPE_MASK(var->type)) { + case TE_VARIABLE: + s->type = TOK_VARIABLE; + s->bound = var->address; + break; + + case TE_CLOSURE0: + case TE_CLOSURE1: + case TE_CLOSURE2: + case TE_CLOSURE3: + case TE_CLOSURE4: + case TE_CLOSURE5: + case TE_CLOSURE6: + case TE_CLOSURE7: + s->context = var->context; + + case TE_FUNCTION0: + case TE_FUNCTION1: + case TE_FUNCTION2: + case TE_FUNCTION3: + case TE_FUNCTION4: + case TE_FUNCTION5: + case TE_FUNCTION6: + case TE_FUNCTION7: + s->type = var->type; + s->function = var->address; + break; + } + } + + } else { + /* Look for an operator or special character. */ + switch (s->next++[0]) { + case '+': + s->type = TOK_INFIX; + s->function = add; + break; + case '-': + s->type = TOK_INFIX; + s->function = sub; + break; + case '*': + s->type = TOK_INFIX; + s->function = mul; + break; + case '/': + s->type = TOK_INFIX; + s->function = divide; + break; + case '^': + s->type = TOK_INFIX; + s->function = pow; + break; + case '%': + s->type = TOK_INFIX; + s->function = fmod; + break; + case '(': + s->type = TOK_OPEN; + break; + case ')': + s->type = TOK_CLOSE; + break; + case ',': + s->type = TOK_SEP; + break; + case ' ': + case '\t': + case '\n': + case '\r': + break; + default: + s->type = TOK_ERROR; + break; + } + } + } + } while(s->type == TOK_NULL); +} + +static te_expr *list(state * s); +static te_expr *expr(state * s); +static te_expr *power(state * s); + +static te_expr *base(state * s) { + /* = | | {"(" ")"} | + | "(" {"," } ")" | + "(" ")" */ + te_expr *ret; + int arity; + + switch (TYPE_MASK(s->type)) { + case TOK_NUMBER: + ret = new_expr(TE_CONSTANT, 0); + ret->value = s->value; + next_token(s); + break; + + case TOK_VARIABLE: + ret = new_expr(TE_VARIABLE, 0); + ret->bound = s->bound; + next_token(s); + break; + + case TE_FUNCTION0: + case TE_CLOSURE0: + ret = new_expr(s->type, 0); + ret->function = s->function; + if(IS_CLOSURE(s->type)) + ret->parameters[0] = s->context; + next_token(s); + if(s->type == TOK_OPEN) { + next_token(s); + if(s->type != TOK_CLOSE) { + s->type = TOK_ERROR; + } else { + next_token(s); + } + } + break; + + case TE_FUNCTION1: + case TE_CLOSURE1: + ret = new_expr(s->type, 0); + ret->function = s->function; + if(IS_CLOSURE(s->type)) + ret->parameters[1] = s->context; + next_token(s); + ret->parameters[0] = power(s); + break; + + case TE_FUNCTION2: + case TE_FUNCTION3: + case TE_FUNCTION4: + case TE_FUNCTION5: + case TE_FUNCTION6: + case TE_FUNCTION7: + case TE_CLOSURE2: + case TE_CLOSURE3: + case TE_CLOSURE4: + case TE_CLOSURE5: + case TE_CLOSURE6: + case TE_CLOSURE7: + arity = ARITY(s->type); + + ret = new_expr(s->type, 0); + ret->function = s->function; + if(IS_CLOSURE(s->type)) + ret->parameters[arity] = s->context; + next_token(s); + + if(s->type != TOK_OPEN) { + s->type = TOK_ERROR; + } else { + int i; + for(i = 0; i < arity; i++) { + next_token(s); + ret->parameters[i] = expr(s); + if(s->type != TOK_SEP) { + break; + } + } + if(s->type != TOK_CLOSE || i != arity - 1) { + s->type = TOK_ERROR; + } else { + next_token(s); + } + } + + break; + + case TOK_OPEN: + next_token(s); + ret = list(s); + if(s->type != TOK_CLOSE) { + s->type = TOK_ERROR; + } else { + next_token(s); + } + break; + + default: + ret = new_expr(0, 0); + s->type = TOK_ERROR; + ret->value = NAN; + break; + } + + return ret; +} + +static te_expr *power(state * s) { + /* = {("-" | "+")} */ + int sign = 1; + while(s->type == TOK_INFIX && (s->function == add || s->function == sub)) { + if(s->function == sub) + sign = -sign; + next_token(s); + } + + te_expr *ret; + + if(sign == 1) { + ret = base(s); + } else { + ret = NEW_EXPR(TE_FUNCTION1 | TE_FLAG_PURE, base(s)); + ret->function = negate; + } + + return ret; +} + +#ifdef TE_POW_FROM_RIGHT +static te_expr *factor(state * s) { + /* = {"^" } */ + te_expr *ret = power(s); + + int neg = 0; + te_expr *insertion = 0; + + if(ret->type == (TE_FUNCTION1 | TE_FLAG_PURE) && ret->function == negate) { + te_expr *se = ret->parameters[0]; + free(ret); + ret = se; + neg = 1; + } + + while(s->type == TOK_INFIX && (s->function == pow)) { + te_fun2 t = s->function; + next_token(s); + + if(insertion) { + /* Make exponentiation go right-to-left. */ + te_expr *insert = + NEW_EXPR(TE_FUNCTION2 | TE_FLAG_PURE, + insertion->parameters[1], power(s)); + insert->function = t; + insertion->parameters[1] = insert; + insertion = insert; + } else { + ret = NEW_EXPR(TE_FUNCTION2 | TE_FLAG_PURE, ret, power(s)); + ret->function = t; + insertion = ret; + } + } + + if(neg) { + ret = NEW_EXPR(TE_FUNCTION1 | TE_FLAG_PURE, ret); + ret->function = negate; + } + + return ret; +} +#else +static te_expr *factor(state * s) { + /* = {"^" } */ + te_expr *ret = power(s); + + while(s->type == TOK_INFIX && (s->function == pow)) { + te_fun2 t = s->function; + next_token(s); + ret = NEW_EXPR(TE_FUNCTION2 | TE_FLAG_PURE, ret, power(s)); + ret->function = t; + } + + return ret; +} +#endif + +static te_expr *term(state * s) { + /* = {("*" | "/" | "%") } */ + te_expr *ret = factor(s); + + while(s->type == TOK_INFIX + && (s->function == mul || s->function == divide + || s->function == fmod)) { + te_fun2 t = s->function; + next_token(s); + ret = NEW_EXPR(TE_FUNCTION2 | TE_FLAG_PURE, ret, factor(s)); + ret->function = t; + } + + return ret; +} + +static te_expr *expr(state * s) { + /* = {("+" | "-") } */ + te_expr *ret = term(s); + + while(s->type == TOK_INFIX && (s->function == add || s->function == sub)) { + te_fun2 t = s->function; + next_token(s); + ret = NEW_EXPR(TE_FUNCTION2 | TE_FLAG_PURE, ret, term(s)); + ret->function = t; + } + + return ret; +} + +static te_expr *list(state * s) { + /* = {"," } */ + te_expr *ret = expr(s); + + while(s->type == TOK_SEP) { + next_token(s); + ret = NEW_EXPR(TE_FUNCTION2 | TE_FLAG_PURE, ret, expr(s)); + ret->function = comma; + } + + return ret; +} + +#define TE_FUN(...) ((double (*) (__VA_ARGS__)) n->function) +#define M(e) te_eval(n->parameters[e]) + +double te_eval(const te_expr * n) { + if(!n) + return NAN; + + switch (TYPE_MASK(n->type)) { + case TE_CONSTANT: + return n->value; + case TE_VARIABLE: + return *n->bound; + + case TE_FUNCTION0: + case TE_FUNCTION1: + case TE_FUNCTION2: + case TE_FUNCTION3: + case TE_FUNCTION4: + case TE_FUNCTION5: + case TE_FUNCTION6: + case TE_FUNCTION7: + switch (ARITY(n->type)) { + case 0: + return TE_FUN(void) (); + case 1: + return TE_FUN(double) (M(0)); + case 2: + return TE_FUN(double, double) (M(0), M(1)); + case 3: + return TE_FUN(double, double, double) (M(0), M(1), M(2)); + case 4: + return TE_FUN(double, double, double, double) (M(0), M(1), M(2), + M(3)); + case 5: + return TE_FUN(double, double, double, double, double) (M(0), M(1), + M(2), M(3), + M(4)); + case 6: + return TE_FUN(double, double, double, double, double, + double) (M(0), M(1), M(2), M(3), M(4), M(5)); + case 7: + return TE_FUN(double, double, double, double, double, double, + double) (M(0), M(1), M(2), M(3), M(4), M(5), M(6)); + default: + return NAN; + } + + case TE_CLOSURE0: + case TE_CLOSURE1: + case TE_CLOSURE2: + case TE_CLOSURE3: + case TE_CLOSURE4: + case TE_CLOSURE5: + case TE_CLOSURE6: + case TE_CLOSURE7: + switch (ARITY(n->type)) { + case 0: + return TE_FUN(void *) (n->parameters[0]); + case 1: + return TE_FUN(void *, double) (n->parameters[1], M(0)); + case 2: + return TE_FUN(void *, double, double) (n->parameters[2], M(0), + M(1)); + case 3: + return TE_FUN(void *, double, double, double) (n->parameters[3], + M(0), M(1), M(2)); + case 4: + return TE_FUN(void *, double, double, double, + double) (n->parameters[4], M(0), M(1), M(2), M(3)); + case 5: + return TE_FUN(void *, double, double, double, double, + double) (n->parameters[5], M(0), M(1), M(2), M(3), + M(4)); + case 6: + return TE_FUN(void *, double, double, double, double, double, + double) (n->parameters[6], M(0), M(1), M(2), M(3), + M(4), M(5)); + case 7: + return TE_FUN(void *, double, double, double, double, double, + double, double) (n->parameters[7], M(0), M(1), M(2), + M(3), M(4), M(5), M(6)); + default: + return NAN; + } + + default: + return NAN; + } + +} + +#undef TE_FUN +#undef M + +static void optimize(te_expr * n) { + /* Evaluates as much as possible. */ + if(n->type == TE_CONSTANT) + return; + if(n->type == TE_VARIABLE) + return; + + /* Only optimize out functions flagged as pure. */ + if(IS_PURE(n->type)) { + const int arity = ARITY(n->type); + int known = 1; + int i; + for(i = 0; i < arity; ++i) { + optimize(n->parameters[i]); + if(((te_expr *) (n->parameters[i]))->type != TE_CONSTANT) { + known = 0; + } + } + if(known) { + const double value = te_eval(n); + te_free_parameters(n); + n->type = TE_CONSTANT; + n->value = value; + } + } +} + +te_expr *te_compile(const char *expression, const te_variable * variables, + int var_count, int *error) { + state s; + s.start = s.next = expression; + s.lookup = variables; + s.lookup_len = var_count; + + next_token(&s); + te_expr *root = list(&s); + + if(s.type != TOK_END) { + te_free(root); + if(error) { + *error = (s.next - s.start); + if(*error == 0) + *error = 1; + } + return 0; + } else { + optimize(root); + if(error) + *error = 0; + return root; + } +} + +double te_interp(const char *expression, int *error) { + te_expr *n = te_compile(expression, 0, 0, error); + double ret; + if(n) { + ret = te_eval(n); + te_free(n); + } else { + ret = NAN; + } + return ret; +} + +static void pn(const te_expr * n, int depth, FILE *fp) { + int i, arity; + fprintf(fp, "%*s", depth, ""); + + switch (TYPE_MASK(n->type)) { + case TE_CONSTANT: + fprintf(fp,"%f\n", n->value); + break; + case TE_VARIABLE: + fprintf(fp,"bound %p\n", n->bound); + break; + + case TE_FUNCTION0: + case TE_FUNCTION1: + case TE_FUNCTION2: + case TE_FUNCTION3: + case TE_FUNCTION4: + case TE_FUNCTION5: + case TE_FUNCTION6: + case TE_FUNCTION7: + case TE_CLOSURE0: + case TE_CLOSURE1: + case TE_CLOSURE2: + case TE_CLOSURE3: + case TE_CLOSURE4: + case TE_CLOSURE5: + case TE_CLOSURE6: + case TE_CLOSURE7: + arity = ARITY(n->type); + fprintf(fp,"f%d", arity); + for(i = 0; i < arity; i++) { + fprintf(fp," %p", n->parameters[i]); + } + fprintf(fp,"\n"); + for(i = 0; i < arity; i++) { + pn(n->parameters[i], depth + 1, fp); + } + break; + } +} + +void te_print(const te_expr * n, FILE *fp) { + pn(n, 0, fp); +} diff --git a/src/tinyexpr.h b/src/tinyexpr.h new file mode 100644 index 00000000..4f686646 --- /dev/null +++ b/src/tinyexpr.h @@ -0,0 +1,83 @@ +/* + * TINYEXPR - Tiny recursive descent parser and evaluation engine in C + * + * Copyright (c) 2015, 2016 Lewis Van Winkle + * + * http://CodePlea.com + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgement in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#ifndef __TINYEXPR_H__ +#define __TINYEXPR_H__ + +#define TE_NAT_LOG + +#include + +typedef struct te_expr { + int type; + union { + double value; + const double *bound; + void *function; + }; + void *parameters[1]; +} te_expr; + +enum { + TE_VARIABLE = 0, + + TE_FUNCTION0 = 8, TE_FUNCTION1, TE_FUNCTION2, TE_FUNCTION3, + TE_FUNCTION4, TE_FUNCTION5, TE_FUNCTION6, TE_FUNCTION7, + + TE_CLOSURE0 = 16, TE_CLOSURE1, TE_CLOSURE2, TE_CLOSURE3, + TE_CLOSURE4, TE_CLOSURE5, TE_CLOSURE6, TE_CLOSURE7, + + TE_FLAG_PURE = 32 +}; + +typedef struct te_variable { + const char *name; + void *address; + int type; + void *context; +} te_variable; + +/* Parses the input expression, evaluates it, and frees it. */ + +/* Returns NaN on error. */ +double te_interp(const char *expression, int *error); + +/* Parses the input expression and binds variables. */ + +/* Returns NULL on error. */ +te_expr *te_compile(const char *expression, + const te_variable * variables, int var_count, int *error); + +/* Evaluates the expression. */ +double te_eval(const te_expr * n); + +/* Prints debugging information on the syntax tree. */ +void te_print(const te_expr * n, FILE * fp); + +/* Frees the expression. */ + +/* This is safe to call on NULL pointers. */ +void te_free(te_expr * n); + +#endif /*__TINYEXPR_H__*/ diff --git a/src/try.c b/src/try.c index 12fe26f9..680fd69c 100644 --- a/src/try.c +++ b/src/try.c @@ -1,38 +1,15 @@ #include -#include -#include -#include -#include +#include -int main(int argc, char **argv) { +int main(void) { - char buff[100], *leftover; - double x; + int i = INT_MAX; - strcpy(buff, " 1e 4000 "); - errno=0; - x = strtod(buff, &leftover); - if(errno) { - char err_buff[50]; - strerror_r(errno, err_buff, sizeof(err_buff)); - fprintf(stderr,"%s:%d: Can't parse \"%s\" as float (%s).\n", - __FILE__,__LINE__, buff, err_buff); - } - printf("x=%lf\n", x); + printf("max=%d = %e\n", i, (double) i); - if(leftover==buff) { - fprintf(stderr,"%s:%d: Can't parse \"%s\" as float.\n", - __FILE__,__LINE__, buff); - exit(1); - } - while(isspace(*leftover)) - ++leftover; - - if(*leftover != '\0') { - fprintf(stderr, "%s:%d: Extra chars at end of float \"%s\"." - " Token=\"%s\".\n", - __FILE__, __LINE__, leftover, buff); - exit(1); - } - return 0; + i += 1; + + printf("max+1=%d\n", i); + + return 0; } diff --git a/src/typedefs.h b/src/typedefs.h index d58a8a9b..46dccc0c 100644 --- a/src/typedefs.h +++ b/src/typedefs.h @@ -10,6 +10,7 @@ typedef struct Bounds Bounds; typedef struct BranchTab BranchTab; typedef struct Constraint Constraint; typedef struct El El; +typedef struct Exception Exception; typedef struct Gene Gene; typedef struct GPTree GPTree; typedef struct HashTab HashTab; @@ -27,6 +28,7 @@ typedef struct SampNdx SampNdx; typedef struct StrInt StrInt; typedef struct Tokenizer Tokenizer; typedef struct DAFReader DAFReader; +typedef struct RAFReader RAFReader; /// Distinguish between parameters that free, fixed, Gaussian, or /// constrained. Free parameters can be changed during optimization; diff --git a/test/Makefile b/test/Makefile index 596b40bc..c85a9c2d 100644 --- a/test/Makefile +++ b/test/Makefile @@ -3,9 +3,9 @@ vpath %.c ../src opt := -O0 -fno-inline-functions -DDEBUG prof := incl := -I/usr/local/include -I/opt/local/include -I../src -tests := xbinary xboot xbranchtab xdafreader xdiffev xgene \ +tests := xbinary xboot xbranchtab xdafreader xrafreader xdiffev xgene \ xgptree xpopnodetab xjobqueue xlblndx xllrbtree xparkeyval xparse \ - xparstore xpopnode xsimsched xstrint xdtnorm xterm xmisc + xparstore xpopnode xsimsched xstrint xdtnorm xmisc xerror CC := gcc @@ -39,8 +39,10 @@ test : $(tests) -./xboot -./xbranchtab -./xdafreader + -./xrafreader -./xdiffev -./xdtnorm + -./xerror -./xgene -./xgptree -./xjobqueue @@ -54,7 +56,6 @@ test : $(tests) -./xpopnodetab -./xsimsched -./xstrint - -./xterm @echo "ALL UNIT TESTS WERE COMPLETED." XBINARY := xbinary.o binary.o @@ -67,7 +68,7 @@ xmisc : $(XMISC) XPOPNODETAB := xpopnodetab.o popnodetab.o misc.o popnode.o gene.o \ branchtab.o lblndx.o tokenizer.o dtnorm.o binary.o \ - parkeyval.o parstore.o + parkeyval.o parstore.o tinyexpr.o xpopnodetab : $(XPOPNODETAB) $(CC) $(CFLAGS) -o $@ $(XPOPNODETAB) $(lib) @@ -75,6 +76,10 @@ XDTNORM := xdtnorm.o dtnorm.o xdtnorm : $(XDTNORM) $(CC) $(CFLAGS) -o $@ $(XDTNORM) $(lib) +XERROR := xerror.o error.o misc.o +xerror : $(XERROR) + $(CC) $(CFLAGS) -o $@ $(XERROR) $(lib) + XJOBQUEUE := xjobqueue.o jobqueue.o misc.o xjobqueue : $(XJOBQUEUE) $(CC) $(CFLAGS) -o $@ $(XJOBQUEUE) $(lib) @@ -89,22 +94,20 @@ XDAFREADER := xdafreader.o dafreader.o misc.o lblndx.o tokenizer.o binary.o \ xdafreader : $(XDAFREADER) $(CC) $(CFLAGS) -o $@ $(XDAFREADER) $(lib) +XRAFREADER := xrafreader.o rafreader.o misc.o lblndx.o tokenizer.o binary.o \ + parkeyval.o strint.o error.o +xrafreader : $(XRAFREADER) + $(CC) $(CFLAGS) -o $@ $(XRAFREADER) $(lib) + xparse.o : parse.c $(CC) $(CFLAGS) -c -DTEST -o $@ ../src/parse.c XPARSE := xparse.o popnodetab.o misc.o tokenizer.o gptree.o lblndx.o \ branchtab.o parstore.o parkeyval.o popnode.o binary.o gene.o \ - dprintf.o dtnorm.o + dprintf.o dtnorm.o tinyexpr.o xparse : $(XPARSE) $(CC) $(CFLAGS) -o $@ $(XPARSE) $(lib) -xterm.o : parstore.c - $(CC) $(CFLAGS) -c -DTEST -o $@ ../src/parstore.c - -XTERM := xterm.o misc.o parkeyval.o dtnorm.o -xterm : $(XTERM) - $(CC) $(CFLAGS) -o $@ $(XTERM) $(lib) - xgene.o : gene.c $(CC) $(CFLAGS) -c -DTEST -o $@ ../src/gene.c @@ -120,7 +123,7 @@ xpopnode.o : popnode.c $(CC) $(CFLAGS) -c -DTEST -o $@ ../src/popnode.c XPOPNODE := xpopnode.o misc.o gene.o branchtab.o binary.o lblndx.o \ - tokenizer.o parkeyval.o dtnorm.o parstore.o + tokenizer.o parkeyval.o dtnorm.o parstore.o tinyexpr.o xpopnode : $(XPOPNODE) $(CC) $(CFLAGS) -o $@ $(XPOPNODE) $(lib) @@ -131,14 +134,14 @@ xsimsched : $(XSIMSCHED) xgptree.o : gptree.c $(CC) $(CFLAGS) -c -DTEST -o $@ ../src/gptree.c -XGPTREE := xgptree.o misc.o branchtab.o parstore.o parse.o lblndx.o \ +XGPTREE := xgptree.o misc.o branchtab.o parstore.o lblndx.o \ parkeyval.o tokenizer.o popnodetab.o gene.o popnode.o binary.o \ - dprintf.o dtnorm.o + dprintf.o dtnorm.o parse.o tinyexpr.o xgptree : $(XGPTREE) $(CC) $(CFLAGS) -o $@ $(XGPTREE) $(lib) XPARSTORE := xparstore.o misc.o parstore.o parkeyval.o binary.o lblndx.o \ - dtnorm.o + dtnorm.o tinyexpr.o xparstore : $(XPARSTORE) $(CC) $(CFLAGS) -o $@ $(XPARSTORE) $(lib) @@ -165,7 +168,7 @@ xbranchtab.o : branchtab.c XBRANCHTAB := xbranchtab.o gptree.o misc.o binary.o parstore.o popnode.o \ gene.o lblndx.o parse.o parkeyval.o tokenizer.o popnodetab.o \ - dprintf.o dtnorm.o + dprintf.o dtnorm.o tinyexpr.o xbranchtab : $(XBRANCHTAB) $(CC) $(CFLAGS) -o $@ $(XBRANCHTAB) $(lib) diff --git a/test/xerror.c b/test/xerror.c new file mode 100644 index 00000000..fa3626c1 --- /dev/null +++ b/test/xerror.c @@ -0,0 +1,44 @@ +/** + * @file xerror.c + * @author Alan R. Rogers + * @brief Test error.c. + * @copyright Copyright (c) 2017, Alan R. Rogers + * . This file is released under the Internet + * Systems Consortium License, which can be found in file "LICENSE". + */ +#include "error.h" +#include "misc.h" +#include +#include +#include + +#ifdef NDEBUG +#error "Unit tests must be compiled without -DNDEBUG flag" +#endif + +int main(void) { + + char buff[100]; + + assert(0 == mystrerror_r(NO_ANCESTRAL_ALLELE, buff, sizeof buff)); + assert(0 == strcmp(buff, "No ancestral allele")); + + assert(0 == mystrerror_r(REF_MISMATCH, buff, sizeof buff)); + assert(0 == strcmp(buff, "Inconsistent REF alleles")); + + assert(0 == mystrerror_r(MULTIPLE_ALT, buff, sizeof buff)); + assert(0 == strcmp(buff, "Multiple ALT alleles")); + + assert(0 == mystrerror_r(BUFFER_OVERFLOW, buff, sizeof buff)); + assert(0 == strcmp(buff, "Buffer overflow")); + + assert(0 == mystrerror_r(BAD_RAF_INPUT, buff, sizeof buff)); + assert(0 == strcmp(buff, "Bad .raf input file")); + + assert(0 == mystrerror_r(BAD_SORT, buff, sizeof buff)); + assert(0 == strcmp(buff, "Incorrect sort")); + + unitTstResult("mystrerror_r", "OK"); + + return 0; +} diff --git a/test/xmisc.c b/test/xmisc.c index dc81fed0..1370c09f 100644 --- a/test/xmisc.c +++ b/test/xmisc.c @@ -130,5 +130,16 @@ int main(int argc, char **argv) { unitTstResult("parseDbl", "OK"); + strcpy(buff, "abcdefghijklmn"); + assert(0 == strcmp("lmn", strltrunc(buff, 3))); + + strcpy(buff, "abc"); + assert(0 == strcmp("abc", strltrunc(buff, 3))); + + strcpy(buff, "abc"); + assert(0 == strcmp("abc", strltrunc(buff, 4))); + + unitTstResult("strltrunc", "OK"); + return 0; } diff --git a/test/xparstore.c b/test/xparstore.c index 7bbca83e..461324e6 100644 --- a/test/xparstore.c +++ b/test/xparstore.c @@ -53,8 +53,24 @@ int main(int argc, char **argv) { assert(Bounds_equals(&bnd0, &bnd1)); unitTstResult("Bounds", "OK"); - // Test Constraint double x=1.0, y=1.0, z=2.0; + double *px=&x, *py=&y, *pz=&z; + double **ppx=&px, **ppy=&py, **ppz=&pz; + + assert(0 == compareDbls(px, py)); + assert(0 == compareDbls(px, px)); + assert(0 > compareDbls(px, pz)); + assert(0 < compareDbls(pz, py)); + + unitTstResult("compareDbls", "OK"); + + assert(0 == compareDblPtrs(ppx, ppy)); + assert(0 == compareDblPtrs(ppx, ppx)); + assert(0 > compareDblPtrs(ppx, ppz)); + assert(0 < compareDblPtrs(ppz, ppy)); + + unitTstResult("compareDblPtrs", "OK"); + ParKeyVal *pkv = NULL; pkv = ParKeyVal_add(pkv, "x", &x, Free); pkv = ParKeyVal_add(pkv, "y", &y, Free); @@ -62,30 +78,44 @@ int main(int argc, char **argv) { char buff[100]; strcpy(buff, "1+1*x + 2*x*y"); - Constraint *c = Constraint_new(pkv, buff); - assert(1 + 1*x + 2*x*y == Constraint_getValue(c)); - if(verbose) { - printf("Constraint formula: "); - Constraint_prFormula(c, stdout); - } double xx=1.0, yy=1.0, zz=2.0; ParKeyVal *pkv2 = NULL; pkv2 = ParKeyVal_add(pkv2, "x", &xx, Free); pkv2 = ParKeyVal_add(pkv2, "y", &yy, Free); pkv2 = ParKeyVal_add(pkv2, "z", &zz, Free); - Constraint *c2 = Constraint_dup(c, pkv2); - assert(Constraint_equals(c, c2)); - unitTstResult("Constraint", "OK"); + unitTstResult("ParKeyVal", "OK"); ParStore *ps = ParStore_new(); assert(ParStore_nFixed(ps) == 0); assert(ParStore_nFree(ps) == 0); assert(ParStore_nGaussian(ps) == 0); + ParStore_free(ps); double val, *ptr; ParamStatus pstat, pstat2; + ps = ParStore_new(); + double fixed0=99.0, fixed1=100.0; + ParStore_addFreePar(ps, x, 0.0, 100.0, "x"); + ParStore_addFreePar(ps, y, 0.0, 100.0, "y"); + ParStore_addFreePar(ps, z, 0.0, 100.0, "z"); + ParStore_addFixedPar(ps, fixed0, "fixed0"); + ParStore_addFixedPar(ps, fixed1, "fixed1"); + ParStore_addConstrainedPar(ps, "exp(x+log(y+z))", "c"); + ParStore_constrain(ps); + assert(2 == ParStore_nFixed(ps)); + assert(3 == ParStore_nFree(ps)); + assert(1 == ParStore_nConstrained(ps)); + assert(x = ParStore_getFree(ps, 0)); + assert(y = ParStore_getFree(ps, 1)); + assert(z = ParStore_getFree(ps, 2)); + assert(fixed0 = ParStore_getFixed(ps, 0)); + assert(fixed1 = ParStore_getFixed(ps, 1)); + assert(exp(x+log(y+z)) == ParStore_getConstrained(ps, 0)); + ParStore_free(ps); + + ps = ParStore_new(); val = 12.3; ParStore_addFixedPar(ps, val, "x"); ptr = ParStore_findPtr(ps, &pstat, "x"); @@ -214,8 +244,6 @@ int main(int argc, char **argv) { ParKeyVal_free(pkv); ParKeyVal_free(pkv2); gsl_rng_free(rng); - Constraint_free(c); - Constraint_free(c2); unitTstResult("ParStore", "OK"); return 0; diff --git a/test/xrafreader.c b/test/xrafreader.c new file mode 100644 index 00000000..b6229aa4 --- /dev/null +++ b/test/xrafreader.c @@ -0,0 +1,171 @@ +/** + * @file xrafreader.c + * @author Alan R. Rogers + * @brief Test rafreader.c. + * @copyright Copyright (c) 2016, Alan R. Rogers + * . This file is released under the Internet + * Systems Consortium License, which can be found in file "LICENSE". + */ + +#include "rafreader.h" +#include "misc.h" +#include "error.h" +#include +#include +#include +#include + +#ifdef NDEBUG +#error "Unit tests must be compiled without -DNDEBUG flag" +#endif + +const char *badInput = + "#chr\tpos\tref\talt\traf\n" + "10\t1\ta\tt\t5e-1\n" + "1\t1\ta\t.\t0\n" + "10\t200\tg\tc\t1e0\n" + "10\t201\tg\tc\t1e99999\n" + "10\t202\tg\tc\t1e-99999\n" "10\t203\tg\tc\tnot-a-float\n"; + +const char *tstInput[3] = { + "#chr\tpos\tref\talt\traf\n" + "1\t1\ta\t.\t0\n" + "10\t1\ta\tt\t5e-1\n" + "10\t200\tg\tc\t1e0\n", + + "#chr\tpos\tref\talt\traf\n" + "1\t1\ta\t.\t0.5\n" + "1\t2\ta\t.\t0.5\n" + "10\t1\ta\tt\t1e-1\n" + "10\t200\tg\tc\t1\n", + + "#chr\tpos\tref\talt\traf\n" + "1\t1\ta\t.\t1\n" + "10\t1\ta\tt\t0\n" + "10\t100\ta\tt\t0\n" + "10\t200\tg\tc\t0\n" +}; + +int main(int argc, char **argv) { + + int i, verbose = 0, status; + char errbuff[200]; + + switch (argc) { + case 1: + break; + case 2: + if(strncmp(argv[1], "-v", 2) != 0) { + fprintf(stderr, "usage: xrafreader [-v]\n"); + exit(1); + } + verbose = 1; + break; + default: + fprintf(stderr, "usage: xrafreader [-v]\n"); + exit(1); + } + + const char *badraf = "bad.raf"; + FILE *badfp = fopen(badraf, "w"); + assert(badfp); + fputs(badInput, badfp); + fclose(badfp); + RAFReader *rdr = RAFReader_new(badraf); + + for(i=0; i<7; ++i) { + status = RAFReader_next(rdr); + if(i==0) + assert(status==0); + else if(i==1) + assert(status==BAD_SORT); + else if(i==2) + assert(status==0); + else if(i==3) + assert(status==ERANGE); + else if(i==4) + assert(status==ERANGE); + else if(i==5) + assert(status==EINVAL); + else if(i==6) + assert(status==EOF); + }while(status != EOF); + RAFReader_free(rdr); + remove(badraf); + + const char *tst[3] = {"tst0.raf", "tst1.raf", "tst2.raf"}; + FILE * fp[3]; + for(i = 0; i < 3; ++i) { + fp[i] = fopen(tst[i], "w"); + assert(fp[i]); + fputs(tstInput[i], fp[i]); + fclose(fp[i]); + } + + RAFReader * r[3]; + if(verbose) + RAFReader_printHdr(stderr); + for(i = 0; i < 3; ++i) { + r[i] = RAFReader_new(tst[i]); + do{ + status = RAFReader_next(r[i]); + switch (status) { + case 0: + if(verbose) + RAFReader_print(r[i], stderr); + break; + case EOF: + break; + default: + mystrerror_r(status, errbuff, sizeof errbuff); + fprintf(stderr, "%s:%d: i=%d %s\n", + __FILE__,__LINE__, i, errbuff); + exit(1); + } + }while(status != EOF); + RAFReader_free(r[i]); + } + + for(i = 0; i < 3; ++i) + r[i] = RAFReader_new(tst[i]); + + i=0; + do{ + status = RAFReader_multiNext(3, r); + if(i==0) { + assert(status==0); + assert(0 == strcmp("1",RAFReader_chr(r[0]))); + assert(1UL == RAFReader_nucpos(r[0])); + assert(1.0 == RAFReader_daf(r[0])); + assert(0.5 == RAFReader_daf(r[1])); + assert(0.0 == RAFReader_daf(r[2])); + }else if(i==1) { + assert(status==0); + assert(0 == strcmp("10",RAFReader_chr(r[0]))); + assert(1UL == RAFReader_nucpos(r[0])); + assert(0.5 == RAFReader_daf(r[0])); + assert(0.1 == RAFReader_daf(r[1])); + assert(0.0 == RAFReader_daf(r[2])); + }else if(i==2) { + assert(status==NO_ANCESTRAL_ALLELE); + assert(0 == strcmp("10",RAFReader_chr(r[0]))); + assert(200UL == RAFReader_nucpos(r[0])); + }else if(i==3) { + assert(status==EOF); + }else{ + fprintf(stderr,"%s:%d: this shouldn't happen\n", + __FILE__,__LINE__); + exit(1); + } + ++i; + }while(status != EOF); + + for(i = 0; i < 3; ++i) { + RAFReader_free(r[i]); + remove(tst[i]); + } + + unitTstResult("RAFReader", "OK"); + + return 0; +}