Skip to content

Commit

Permalink
refactor: use logging instead of print statements (ksamuk#97)
Browse files Browse the repository at this point in the history
Closes ksamuk#12.

This PR swaps prints for logging and also uses f-strings where
appropriate. I waffled on whether or not to remove the `[pixy]` label
from the messages.

Log snippet output:
```
$ pixy --stats pi fst dxy --vcf /Users/erin/git/client/pixy/tests/main/data/ag1000_pixy_test.vcf.gz --populations /Users/erin/git/client/pixy/tests/main/data/ag1000_populations_file.txt  --bypass_invariant_check no --window_size 10000  --output_folder /Users/erin/scratch/billable/pixy/version_testing_bypass_invar_check/1_2_5_beta1_reinstall/ 
2025-02-13 16:03:55,055 pixy.__main__:main:261 [INFO]: [pixy] pixy 1.2.10.beta2
2025-02-13 16:03:55,056 pixy.__main__:main:262 [INFO]: [pixy] See documentation at https://pixy.readthedocs.io/en/latest/
2025-02-13 16:03:55,066 pixy.args_validation:check_and_validate_args:477 [INFO]: Validating VCF and input parameters...
2025-02-13 16:03:55,066 pixy.args_validation:check_and_validate_args:480 [INFO]: Checking write access...
2025-02-13 16:03:55,066 pixy.args_validation:check_and_validate_args:487 [INFO]: Checking CPU configuration...
2025-02-13 16:03:55,066 pixy.args_validation:check_and_validate_args:510 [INFO]: Checking for invariant sites...
2025-02-13 16:03:56,408 pixy.args_validation:check_and_validate_args:547 [INFO]: Checking chromosome data...
2025-02-13 16:03:56,441 pixy.args_validation:check_and_validate_args:555 [INFO]: Checking intervals/sites...
2025-02-13 16:03:56,469 pixy.args_validation:check_and_validate_args:560 [INFO]: OK
2025-02-13 16:03:56,469 pixy.args_validation:check_and_validate_args:605 [INFO]: Checking sample data...
2025-02-13 16:03:56,475 pixy.args_validation:check_and_validate_args:647 [INFO]: All initial checks passed!
2025-02-13 16:03:56,476 pixy.__main__:main:272 [INFO]: [pixy] Preparing for calculation of summary statistics: pi, fst, dxy
2025-02-13 16:03:56,476 pixy.__main__:main:282 [INFO]: [pixy] Using Weir and Cockerham (1984)'s estimator of FST.
2025-02-13 16:03:56,476 pixy.__main__:main:284 [INFO]: [pixy] Data set contains 2 populations, 2 chromosome(s), and 36 sample(s)
2025-02-13 16:03:56,476 pixy.__main__:main:291 [INFO]: [pixy] Window size: 10000 bp
2025-02-13 16:03:56,476 pixy.__main__:main:301 [INFO]: Started calculations at         16:03:56 on 2025-02-13
2025-02-13 16:03:56,476 pixy.__main__:main:305 [INFO]: [pixy] Using 1 out of 14 available CPU cores
2025-02-13 16:03:56,476 pixy.__main__:main:353 [INFO]: [pixy] Processing chromosome/contig X
2025-02-13 16:03:56,688 pixy.__main__:main:394 [INFO]: [pixy] Calculating statistics for region X:1-54321
2025-02-13 16:03:58,836 pixy.__main__:main:353 [INFO]: [pixy] Processing chromosome/contig 1
2025-02-13 16:03:59,054 pixy.__main__:main:394 [INFO]: [pixy] Calculating statistics for region 1:1-54321
2025-02-13 16:04:01,224 pixy.__main__:main:840 [INFO]: [pixy] All calculations complete at 16:04:01 on 2025-02-13
2025-02-13 16:04:01,225 pixy.__main__:main:845 [INFO]: [pixy] Time elapsed: 00:00:04
2025-02-13 16:04:01,225 pixy.__main__:main:846 [INFO]: [pixy] Output files written to /Users/erin/scratch/billable/pixy/version_testing_bypass_invar_check/1_2_5_beta1_reinstall
2025-02-13 16:04:01,225 pixy.__main__:main:855 [INFO]: [pixy] If you use pixy in your research, please cite the following paper: Korunes, KL and K Samuk. pixy: Unbiased estimation of nucleotide diversity and divergence in the presence of missing data. Mol Ecol Resour. 2021 Jan 16. doi: 10.1111/1755-0998.13326.
```
  • Loading branch information
emmcauley authored Feb 21, 2025
1 parent a48abb8 commit 042c813
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 113 deletions.
88 changes: 37 additions & 51 deletions pixy/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def main() -> None: # noqa: C901
level=logging.INFO,
format="%(asctime)s %(name)s:%(funcName)s:%(lineno)s [%(levelname)s]: %(message)s",
)

logger = logging.getLogger(__name__)
# initialize arguments
parser = argparse.ArgumentParser(
description=help_image + help_text + "\n" + version,
Expand Down Expand Up @@ -258,8 +258,8 @@ def main() -> None: # noqa: C901

# validate arguments with the check_and_validate_args fuction
# returns parsed populaion, chromosome, and sample info
print("[pixy] pixy " + version)
print("[pixy] See documentation at https://pixy.readthedocs.io/en/latest/")
logger.info(f"[pixy] pixy {version}")
logger.info("[pixy] See documentation at https://pixy.readthedocs.io/en/latest/")

pixy_args: PixyArgs = pixy.args_validation.check_and_validate_args(args)
popindices = {}
Expand All @@ -269,52 +269,40 @@ def main() -> None: # noqa: C901
].callset_index.values
chrom_list = pixy_args.chromosomes

print(
"\n[pixy] Preparing for calculation of summary statistics: "
+ ", ".join(map(str, pixy_args.stats))
logger.info(
f"[pixy] Preparing for calculation of summary statistics: {', '.join(map(str, args.stats))}"
)

fst_cite: str
if PixyStat.FST in pixy_args.stats:
if pixy_args.fst_type is FSTEstimator.WC:
fst_cite = "Weir and Cockerham (1984)"
elif pixy_args.fst_type is FSTEstimator.HUDSON:
fst_cite = "Hudson (1992)"
print("[pixy] Using " + fst_cite + "'s estimator of FST.")

print(
"[pixy] Data set contains "
+ str(len(pixy_args.pop_names))
+ " population(s), "
+ str(len(chrom_list))
+ " chromosome(s), and "
+ str(len(pixy_args.pop_ids))
+ " sample(s)"
logger.info(f"[pixy] Using {fst_cite}'s estimator of FST.")

logger.info(
f"[pixy] Data set contains {len(pixy_args.pop_names)} populations, "
f"{len(chrom_list)} chromosome(s), "
f"and {len(pixy_args.pop_ids)} sample(s)"
)

if pixy_args.window_size is not None:
print("[pixy] Window size: " + str(args.window_size) + " bp")
logger.info(f"[pixy] Window size: {pixy_args.window_size} bp")

if args.bed_file is not None:
print("[pixy] Windows sourced from: " + args.bed_file)
logger.info(f"[pixy] Windows sourced from: {args.bed_file}")

if args.sites_file is not None:
print("[pixy] Calculations restricted to sites in: " + args.sites_file)

print("")
logger.info(f"[pixy] Calculations restricted to sites in {args.sites_file}")

# time the calculations
start_time = time.time()
print(
"[pixy] Started calculations at "
+ time.strftime("%H:%M:%S on %Y-%m-%d", time.localtime(start_time))
)
print(
"[pixy] Using "
+ str(pixy_args.num_cores)
+ " out of "
+ str(mp.cpu_count())
+ " available CPU cores\n"
logger.info(
f"Started calculations at \
{time.strftime('%H:%M:%S on %Y-%m-%d', time.localtime(start_time))}"
)
logger.info(f"[pixy] Using {pixy_args.num_cores} out of {mp.cpu_count()} available CPU cores")
# if in mc mode, set up multiprocessing
if pixy_args.num_cores > 1:
# use forking context on linux, and spawn otherwise (macOS)
Expand Down Expand Up @@ -362,7 +350,7 @@ def listener(q: Queue, temp_file: str) -> None:
# begin processing each chromosome

for chromosome in pixy_args.chromosomes:
print("[pixy] Processing chromosome/contig " + chromosome + "...")
logger.info(f"[pixy] Processing chromosome/contig {chromosome}")

# if not using a bed file, build windows manually
if pixy_args.bed_df is None:
Expand Down Expand Up @@ -403,7 +391,7 @@ def listener(q: Queue, temp_file: str) -> None:

targ_region = chromosome + ":" + str(interval_start) + "-" + str(interval_end)

print("[pixy] Calculating statistics for region " + targ_region + "...")
logger.info(f"[pixy] Calculating statistics for region {targ_region}")

# Determine list of windows over which to compute stats
# in the case were window size = 1, AND there is a sites file, use the sites file as the
Expand Down Expand Up @@ -587,11 +575,10 @@ def listener(q: Queue, temp_file: str) -> None:

if set(pixy_args.stats) != set(successful_stats):
missing_stats = list(set(pixy_args.stats) - set(successful_stats))
print(
"\n[pixy] WARNING: pixy failed to find any valid gentoype data to calculate the "
"following summary statistics: "
+ ", ".join([str(stat) for stat in missing_stats])
+ ". No output file will be created for these statistics."
logger.warning(
"[pixy] WARNING: pixy failed to find any valid genotype data to calculate the "
f"following summary statistics {', '.join([str(stat) for stat in missing_stats])}."
" No output file will be created for these statistics."
)

outpanel[3] = outpanel[3].astype(str) # force chromosome IDs to string
Expand Down Expand Up @@ -819,8 +806,8 @@ def listener(q: Queue, temp_file: str) -> None:
outfile.close()

if len(chroms_with_no_data) >= 1:
print(
"\n[pixy] NOTE: The following chromosomes/scaffolds did not have sufficient data "
logger.info(
"[pixy] NOTE: The following chromosomes/scaffolds did not have sufficient data "
f"to estimate FST: {', '.join(chroms_with_no_data)}"
)

Expand All @@ -842,32 +829,31 @@ def listener(q: Queue, temp_file: str) -> None:
leftover_tmp_files = list(filter(r.match, outfolder_files))

if len(output_files) == 0:
print(
"\n[pixy] WARNING: pixy failed to write any output files. Your VCF may not contain "
logger.warning(
"[pixy] WARNING: pixy failed to write any output files. Your VCF may not contain "
"valid genotype data, or it was removed via filtering using the specified sites/bed "
"file (if any)."
)

# print completion message
end_time = time.time()
print(
"\n[pixy] All calculations complete at "
logger.info(
"[pixy] All calculations complete at "
+ time.strftime("%H:%M:%S on %Y-%m-%d", time.localtime(end_time))
)
total_time = time.time() - start_time
print("[pixy] Time elapsed: " + time.strftime("%H:%M:%S", time.gmtime(total_time)))
print("[pixy] Output files written to: " + str(pixy_args.output_dir))
logger.info("[pixy] Time elapsed: " + time.strftime("%H:%M:%S", time.gmtime(total_time)))
logger.info(f"[pixy] Output files written to {pixy_args.output_dir}")

if len(leftover_tmp_files) > 0:
print("\n[pixy] NOTE: There are pixy temp files in " + str(pixy_args.output_dir))
print(
logger.info(
f"[pixy] NOTE: There are pixy temp files in {pixy_args.output_dir}."
"[pixy] If these are not actively being used (e.g. by another running pixy process), "
"they can be safely deleted."
)

print(
"\n[pixy] If you use pixy in your research, please cite the following paper:\n[pixy] "
+ citation
logger.info(
f"[pixy] If you use pixy in your research, please cite the following paper: {citation}"
)

# restore output
Expand Down
80 changes: 21 additions & 59 deletions pixy/core.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
import logging
import os
import shutil
import subprocess
Expand Down Expand Up @@ -614,6 +615,7 @@ def check_and_validate_args( # noqa: C901
"""
# CHECK FOR TABIX
logger = logging.getLogger(__name__)
tabix_path = shutil.which("tabix")

if tabix_path is None:
Expand Down Expand Up @@ -649,11 +651,10 @@ def check_and_validate_args( # noqa: C901
# get vcf header info
vcf_headers = allel.read_vcf_headers(vcf_path)

print("\n[pixy] Validating VCF and input parameters...")
logger.info("\n[pixy] Validating VCF and input parameters...")

# CHECK OUTPUT FOLDER
print("[pixy] Checking write access...", end="")
check_message = "OK"
logger.info("[pixy] Checking write access...")

output_folder, output_prefix = validate_output_path(
output_folder=args.output_folder, output_prefix=args.output_prefix
Expand All @@ -665,30 +666,16 @@ def check_and_validate_args( # noqa: C901
pass # file is created and then closed
assert os.access(temp_file, os.W_OK), "temp file is not writable"

if check_message == "OK":
print(check_message)

# CHECK CPU CONFIGURATION
print("[pixy] Checking CPU configuration...", end="")
check_message = "OK"
logger.info("[pixy] Checking CPU configuration...")

if args.n_cores > mp.cpu_count():
check_message = "WARNING"
print(check_message)
print(
"[pixy] WARNING: "
+ str(args.n_cores)
+ " CPU cores requested but only "
+ str(mp.cpu_count())
+ " are available. Using "
+ str(mp.cpu_count())
+ "."
logger.warning(
f"[pixy] WARNING: {args.n_cores} CPU cores requested but only "
f"{mp.cpu_count()} are available. Using {mp.cpu_count()}."
)
args.n_cores = mp.cpu_count()

if check_message == "OK":
print(check_message)

# CHECK FOR EXISTENCE OF INPUT FILES

if args.bed_file is not None:
Expand All @@ -702,8 +689,7 @@ def check_and_validate_args( # noqa: C901

# check if the vcf contains any invariant sites
# a very basic check: just looks for at least one invariant site in the alt field
print("[pixy] Checking for invariant sites...", end="")
check_message = "OK"
logger.info("[pixy] Checking for invariant sites...")

if args.bypass_invariant_check == "no":
alt_list = (
Expand All @@ -723,52 +709,41 @@ def check_and_validate_args( # noqa: C901
"This check can be bypassed via --bypass_invariant_check 'yes'."
)
if "." in alt_list and len(alt_list) == 1:
check_message = "WARNING"
print(
logger.warning(
"[pixy] WARNING: the provided VCF appears to contain no variable sites in the "
"first 100 000 sites. It may have been filtered incorrectly, or genetic diversity "
"may be extremely low. "
"This warning can be suppressed via --bypass_invariant_check 'yes'.'"
)
else:
if not (len(args.stats) == 1 and (args.stats[0] == "fst")):
check_message = "WARNING"
print(check_message)
print(
logger.warning(
"[pixy] EXTREME WARNING: --bypass_invariant_check is set to 'yes'. Note that a "
"lack of invariant sites will result in incorrect estimates."
)

if check_message == "OK":
print(check_message)

# check if requested chromosomes exist in vcf
# parses the whole CHROM column (!)

print("[pixy] Checking chromosome data...", end="")
logger.info("[pixy] Checking chromosome data...")

chrom_list: List[str] = get_chrom_list(args)

print("OK")

# INTERVALS
# check if intervals are correctly specified
# validate the BED file (if present)

print("[pixy] Checking intervals/sites...", end="")
check_message = "OK"
logger.info("[pixy] Checking intervals/sites...")

if args.bed_file is None:
check_message = validate_window_and_interval_args(args)

logger.info(f"{check_message}")
else:
if (
args.interval_start is not None
or args.interval_end is not None
or args.window_size is not None
):
check_message = "ERROR"
print(check_message)
raise Exception(
"[pixy] ERROR: --interval_start, --interval_end, and --window_size are not valid "
"when a BED file of windows is provided."
Expand All @@ -779,9 +754,7 @@ def check_and_validate_args( # noqa: C901
chrom_list = list(set(chrom_list) & set(bed_chrom))

if len(missing) > 0:
check_message = "WARNING"
print(check_message)
print(
logger.warning(
"[pixy] WARNING: the following chromosomes in the BED file do not occur in the VCF "
f"and will be ignored: {missing}"
)
Expand All @@ -801,20 +774,15 @@ def check_and_validate_args( # noqa: C901
missing_sites = list(set(chrom_sites) - set(chrom_list))

if len(missing_sites) > 0:
check_message = "WARNING"
print(check_message)
print(
logger.warning(
"[pixy] WARNING: the following chromosomes in the sites file do not occur in the "
f"VCF and will be ignored: {missing_sites}"
)

if check_message == "OK":
print(check_message)

# SAMPLES
# check if requested samples exist in vcf

print("[pixy] Checking sample data...", end="")
logger.info("[pixy] Checking sample data...")

# - parse + validate the population file
# - format is IND POP (tab separated)
Expand All @@ -832,12 +800,9 @@ def check_and_validate_args( # noqa: C901
try:
samples_callset_index = [samples_list.index(s) for s in populations_df["ID"]]
except ValueError as e:
check_message = "ERROR"
print(check_message)
raise Exception(
"[pixy] ERROR: the following samples are listed in the population file but not in the "
"VCF: ",
missing,
"[pixy] ERROR: the following samples are listed in the population file "
f"but not in the VCF: {missing}"
) from e
else:
populations_df["callset_index"] = samples_callset_index
Expand All @@ -851,15 +816,12 @@ def check_and_validate_args( # noqa: C901
].callset_index.values

if len(popnames) == 1 and ("fst" in args.stats or "dxy" in args.stats):
check_message = "ERROR"
print(check_message)
raise Exception(
"[pixy] ERROR: calcuation of fst and/or dxy requires at least two populations to be "
"[pixy] ERROR: calculation of fst and/or dxy requires at least two populations to be "
"defined in the population file."
)

print("OK")
print("[pixy] All initial checks past!")
logger.info("[pixy] All initial checks passed!")

return (
popnames,
Expand Down
Loading

0 comments on commit 042c813

Please sign in to comment.