diff --git a/github-stats-compiler.py b/github-stats-compiler.py
index 6270ecb..23c2648 100644
--- a/github-stats-compiler.py
+++ b/github-stats-compiler.py
@@ -20,7 +20,7 @@
 import urllib.request
 
 # Modules to connect to the services (including backup)
-from utils import backup
+from utils import backup, config_reader
 from repositories import docker, github, conda
 
 def parseargs():
@@ -34,106 +34,175 @@ def parseargs():
     """
     parser = argparse.ArgumentParser()
-    parser.add_argument("-u", "--user", type=str, help="User owner of the repository in github", required=True)
-    parser.add_argument("-r", "--repo", type=str, help="name of repository in github", required=True)
-    parser.add_argument("-c", "--clone_info", type=str, help="Filename to save clone info", default="clone.csv")
-    parser.add_argument("-v", "--views_info", type=str, help="Filename to save views info", default="visits.csv")
-    parser.add_argument("-d", "--download_info", type=str, help="Filename to save clone info", default="download.csv")
-    parser.add_argument("-docker", "--docker", type=str, help="Filename to save docker info", default="docker_stats.csv")
-    parser.add_argument("-du", "--docker_user", type=str, help="Username of Dockerhub", default=None)
-    parser.add_argument("-dr", "--docker_repo", type=str, help="Repo name in dockerhub", default=None)
-    parser.add_argument("-conda", "--conda", type=str, help="Filename to save conda info", default="conda_stats.csv")
-    parser.add_argument("-ref", "--referrals_info", type=str, help="Filename to save clone info", default="referrals.csv")
-    parser.add_argument("-p", "--pages_info", type=str, help="Filename to save pages info", default="pages_visit.csv")
     parser.add_argument("-l", "--logfile", type=str, help="Filename to save logging info", default="monitor.log")
-    parser.add_argument("-k", "--apikey", type=str, help="Github API key")
-    parser.add_argument("-b", "--backup", type=str, help="Webdav url to save the backup", default=None)
-    parser.add_argument("-bu", "--backup_user", type=str, help="Webdav user to save the backup", default=None)
-    parser.add_argument("-bp", "--backup_password", type=str, help="Webdav password to the backup", default=None)
+    parser.add_argument("-c", "--config", type=str, help="Config file with the repositories to work with", required=True)
+    parser.add_argument("-d", "--debug", action="store_true", help="Show debug logging info")
     return parser.parse_args()
 
-def main():
-    args = parseargs()
-    logger = logging.getLogger("Github") # One logger per repository to make easy to know what failed
-    logging.basicConfig(filename=args.logfile, level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-    logger.info("="*20+" Starting execution "+"="*20)
-    logger.info("Connecting to GITHUB API to get clone info")
-    # There are a lot of errors to handle when trying to connect to the API.
-    # Mainly we face the problem of unauthorized of forbidden queries to the
-    # API. However, there might be other problems that, even unlikely
-    # we might face in the future: API changes or redirection of URL,
+def get_github_stats(user:str, repo:str, apikey:str, save_prefix:str):
+    logger = logging.getLogger("github")
     try:
-        clone_info = github.connect_to_API(github.GITHUB_CLONES_API_URL, args.apikey, args.user, args.repo)
+        # Connect to the API, get the info and write it into a CSV file.
+        # An HTTPError is raised if the connection fails.
+        # TODO: Show the exact error code in the log
+        clone_info = github.connect_to_API(github.GITHUB_CLONES_API_URL, apikey, user, repo)
         logger.info("Clone info retrieved")
-        logger.info("Saving clone info into {file}".format(file=args.clone_info))
-        github.save_clone_info(clone_info, args.clone_info)
+        clone_file:str = save_prefix+"_clone.csv"
+        logger.info("Saving clone info into {file}".format(file=clone_file))
+        github.save_clone_info(clone_info, clone_file)
     except urllib.error.HTTPError as httperror:
         logger.error("Could not connect to GITHUB API due to the error: {error}. If its 401 Unauthorized or 403 Forbidden, please check that the api key has push permission".format(error=httperror))
-
     try:
         logger.info("Connecting to GITHUB API to get download info")
-        download_info = github.get_downloads_of_release(args.user, args.repo)
+        download_info = github.get_downloads_of_release(user, repo)
+        download_file:str = save_prefix+"_downloads.csv"
         logger.info("Download info retrieved")
         try:
-            github.save_download_info(download_info, args.download_info)
+            github.save_download_info(download_info, download_file)
         except Exception as e:
             logger.error("Could connect to API but not save the data. Check disk usage or availability")
             print(e)
     except Exception:
-        logger.error("Could not get info from: "+github.GITHUB_API_URL.format(owner=args.user, repo=args.repo))
+        logger.error("Could not get info from: "+github.GITHUB_API_URL.format(owner=user, repo=repo))
     try:
         logger.info("Connecting to GITHUB API to get views data")
-        views = github.connect_to_API(github.GITHUB_TRAFFIC_VIEWS, args.apikey, args.user, args.repo)
+        views = github.connect_to_API(github.GITHUB_TRAFFIC_VIEWS, apikey, user, repo)
+        views_file:str = save_prefix+"_views.csv"
         logger.info("Retrieved views data")
-        github.save_views_info(views, args.views_info)
+        github.save_views_info(views, views_file)
     except urllib.error.HTTPError as httperror:
         logger.error("Could not connect to GITHUB API due to the error: {error}. If its 401 Unauthorized or 403 Forbidden, please check that the api key has push permission".format(error=httperror))
     try:
         logger.info("Connecting to GITHUB API to get popular pages data")
-        pages = github.connect_to_API(github.GITHUB_POPULAR_PATHS, args.apikey, args.user, args.repo)
+        pages_file:str = save_prefix+"_pages.csv"
+        pages = github.connect_to_API(github.GITHUB_POPULAR_PATHS, apikey, user, repo)
         logger.info("Retrieved popular pages data")
-        github.save_pages_info(pages, args.pages_info)
+        github.save_pages_info(pages, pages_file)
     except urllib.error.HTTPError as httperror:
         logger.error("Could not connect to GITHUB API due to the error: {error}. If its 401 Unauthorized or 403 Forbidden, please check that the api key has push permission".format(error=httperror))
     try:
         logger.info("Connecting to GITHUB API to get referrals data")
-        referrals = github.connect_to_API(github.GITHUB_REFFERAL_SOURCE, args.apikey, args.user, args.repo)
+        referrals = github.connect_to_API(github.GITHUB_REFFERAL_SOURCE, apikey, user, repo)
+        referrals_file:str = save_prefix+"_referrals.csv"
         logger.info("Saving referral info")
-        github.save_referral_info(referrals, args.referrals_info)
+        github.save_referral_info(referrals, referrals_file)
     except urllib.error.HTTPError as httperror:
         logger.error("Could not connect to GITHUB API due to the error: {error}. If its 401 Unauthorized or 403 Forbidden, please check that the api key has push permission".format(error=httperror))
+
+def get_docker_stats(user:str, repo:str, apikey:str, save_file:str):
+    # Note: apikey is currently unused; the public Dockerhub repository endpoint needs no auth
     try:
-        if (args.docker_user is None or args.docker_repo is None):
-            logging.getLogger("Docker")
-            logging.info("No Docker credentials provided: docker user or repository given")
-        else:
-            docker_stats = docker.get_docker_stats(docker.REPOSITORY_API_URL, args.docker_user, args.docker_repo)
-            docker.save_docker_stats(docker_stats[0], docker_stats[1], args.docker)
+        docker_stats = docker.get_docker_stats(docker.REPOSITORY_API_URL, user, repo)
+        docker.save_docker_stats(docker_stats[0], docker_stats[1], save_file)
     except urllib.error.HTTPError as httperror:
         logger_docker = logging.getLogger("Docker")
         logger_docker.error("Error connecting to Dockerhub: {error}".format(error=httperror))
+
+def get_bioconductor_stats(package:str, savefile:str):
+    # TODO: Bioconductor support is not implemented yet
+    pass
+
+def get_conda_stats(owner:str, repo:str, savefile:str):
+    logger_conda = logging.getLogger("Conda")
     try:
-        conda_stats = conda.get_conda_stats(conda.CONDA_API, args.user, args.repo)
-        conda.save_conda_stats(conda_stats[0], conda_stats[1], args.conda)
+        conda_stats = conda.get_conda_stats(conda.CONDA_API, owner, repo)
+        conda.save_conda_stats(conda_stats, savefile)
     except urllib.error.HTTPError as httperror:
-        logger_conda = logging.getLogger("Conda")
-        logger_conda.error("Error connecting to Conda: {error}".format(error=httperror))
+        logger_conda.error("Error connecting to Conda API: {error}".format(error=httperror))
+
+def get_stats_for_tool(tool:dict, tool_name:str, folder:str):
+    # One logger per tool
+    logger = logging.getLogger(tool_name)
+    logger.info("Starting: {}".format(tool_name))
+
+    for repository in tool.keys():
+        logger.info("Connecting to {}".format(repository))
+        match repository:
+
+            case "github": get_github_stats(tool[repository]["owner"], tool[repository]["repo"],
+                                            tool[repository]["apikey"], os.path.join(folder, tool[repository]["savefile_prefix"]))
+
+            case "docker": get_docker_stats(tool[repository]["owner"], tool[repository]["repo"],
+                                            tool[repository]["apikey"], os.path.join(folder, tool[repository]["savefile"]))
+
+            case "conda": get_conda_stats(tool[repository]["owner"],
+                                          tool[repository]["repo"],
+                                          os.path.join(folder, tool[repository]["savefile"]))
+            case "cran": pass
+
+            case "bioconductor": get_bioconductor_stats(tool[repository]["package"],
+                                                        os.path.join(folder, tool[repository]["savefile"]))
+            case _:
+                logger.error("Repository not supported: {}".format(repository))
+
+def main():
+    # There are a lot of errors to handle when trying to connect to the APIs.
+    # Mainly we face unauthorized or forbidden queries, but there might be
+    # other problems that, even if unlikely, we could face in the future:
+    # API changes, redirection of URLs, etc.
+
+    # Check the logging level from the arguments: debug mode can be
+    # activated with the --debug flag; otherwise only info messages are shown.
+    args = parseargs()
+    loglevel = logging.DEBUG if args.debug else logging.INFO
+    logging.basicConfig(filename=args.logfile, level=loglevel, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    # One logger per repository to make it easy to know where something failed.
+    # GSS (Github stats saver) is the logger used for any message not related
+    # to a specific element of the code (repositories, backup, etc.)
+    logger = logging.getLogger("GSS")
+
+    logger.info("="*20+" Starting execution "+"="*20)
+    logger.debug("Debug mode activated: saving into {}".format(args.logfile))
+    logger.info("Loading config file from: {path}".format(path=args.config))
+
+    config = config_reader.load_config(args.config)
+
+    logger.debug("Config file loaded")
+
+    tools_data = config["tools"]
+
+    logger.info("{} tools to monitor".format(len(tools_data)))
+    logger.debug("Tools: {}".format(tools_data))
+
+    for tool in tools_data.keys():
+        get_stats_for_tool(tools_data[tool], tool, config["root_folder"])
 
-    logger.info("Connecting to GITHUB API to get clone info")
-    if (args.backup is not None):
-
+    if (config["backup"]["activate"]):
+        backup_data = config["backup"]
         logger_backup = logging.getLogger("Backup")
+        # We can face several errors when making the backup.
+        # The most usual will be a network connection error to the webdav
+        # server, but there may be others.
         try:
-            files:list = [args.clone_info, args.views_info, args.download_info, args.pages_info, args.docker, args.conda, args.referrals_info]
-            files:list = list(filter(os.path.exists, files))
+            files:list = [os.path.join(config["root_folder"], file) for file in os.listdir(config["root_folder"]) if file.endswith(".csv")]
             logger_backup.info("Backup of {} files".format(len(files)))
-            tar_gz_file:str = "backup-stats-{}.tar.gz".format(datetime.datetime.today().strftime('%Y-%m-%d'))
+            logger_backup.debug("Files to backup: {}".format(files))
+            filename:str = "backup-stats-{}.tar.gz".format(datetime.datetime.today().strftime('%Y-%m-%d'))
+            tar_gz_file:str = os.path.join(config["root_folder"], filename)
+            logger_backup.debug("Backup file: {}".format(tar_gz_file))
             backup._tar_gz(files, tar_gz_file)
             logger_backup.info("Generated backup file: {}".format(tar_gz_file))
-            backup._upload(args.backup, tar_gz_file, tar_gz_file, args.backup_user, args.backup_password)
-            logger_backup.info("Backup file uploaded succesfully")
+            status = backup._upload(backup_data["backup_url_folder"], tar_gz_file, filename, backup_data["user"], backup_data["password"])
+            logger_backup.debug("Status code: {}".format(status))
+            if (status in (201, 204)):
+                # WebDAV PUT answers 201 Created for a new file and 204 No Content for an overwrite
+                logger_backup.info("Backup file uploaded successfully")
+            elif (status == 404):
+                logger_backup.error("Remote folder to store the backup was not found: {}".format(backup_data["backup_url_folder"]))
         except urllib.error.HTTPError as neterror:
             logger_backup.error("Could not upload backup because of error: {}".format(neterror))
+        except Exception as error:
+            logger_backup.error("Unhandled error: {}".format(error))
+            logger_backup.error("Backup not completed.")
 
 if __name__ == "__main__":
     main()
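
To make the new config-driven flow concrete, here is a minimal sketch of a config that `get_stats_for_tool()` can dispatch on, mirroring the keys of `template.json` added later in this patch. The tool name, owner/repo values and file names below are placeholders for illustration, not real projects:

```python
# Write a minimal config.json for github-stats-compiler.py.
# All concrete values here are hypothetical examples.
import json

config = {
    "root_folder": "stats",  # all CSV files are written under this folder
    "backup": {"activate": False, "method": "webdav", "compression": ["tar.gz"],
               "user": "", "password": "", "backup_url_folder": ""},
    "tools": {
        "mytool": {
            "github": {"owner": "someuser", "repo": "mytool",
                       "apikey": "a-github-token-with-push-permission",
                       "savefile_prefix": "mytool"},
            "conda": {"owner": "bioconda", "repo": "mytool",
                      "savefile": "mytool_conda.csv"}
        }
    }
}

with open("config.json", "w") as fh:
    json.dump(config, fh, indent=4)

# Then run:  python github-stats-compiler.py -c config.json -d
```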
logging.getLogger("Conda") def get_conda_stats(API_url:str, owner:str, repo:str) -> (): - request = urllib.request.Request(API_url.format(owner=owner, repo=repo)) + url:str = API_url.format(owner=owner, repo=repo) + logging.info("Connecting to {}".format(url)) + request = urllib.request.Request(url) response = urllib.request.urlopen(request) json_data = json.loads(response.read()) - downloads = ((version["version"], version["ndownloads"]) for version in json_data["files"]) + downloads = [",".join([version["version"], str(version["ndownloads"])]) for version in json_data["files"]] return downloads def save_conda_stats(conda_stats:tuple, conda_file): - data = "\n".join(map(lambda x: ",".join(x))) - print(data) \ No newline at end of file + # There's something i have yet to find in list comprehension, but for now I take the first element because the list is inside another list it shouldn't be + data = "\n".join(conda_stats[0]) + print(data, file=open(conda_file, "at")) \ No newline at end of file diff --git a/repositories/cran.py b/repositories/cran.py new file mode 100644 index 0000000..cd643a8 --- /dev/null +++ b/repositories/cran.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Dec 10 11:17:49 2024 + +@author: frobledo +""" + +import logging +import urllib.request + +# Note that cranlogs only shows downloads since RStudio started tracking them in 2012 +BASE_URL: str = "https://cranlogs.r-pkg.org/badges/grand-total/{package}" +logger = logging.getLogger("Bioconductor") + +def downloads_in_cran(package): + url:str = BASE_URL.format(package=package) + with urllib.request.urlopen(url) as response: + error_code:int = response.getcode() + logging.info("Connecting to cranlogs...") + if (error_code == 200): + logging.info("Succesfully connected to fetch cran downloads for: {package}".format(package=package)) + data = response.read() + else: + logging.error("Error connecting to cranlogs to fetch downloads for: {package}".format(package=package)) \ No newline at end of file diff --git a/repositories/docker.py b/repositories/docker.py index 30d2bb9..1751a14 100644 --- a/repositories/docker.py +++ b/repositories/docker.py @@ -20,8 +20,8 @@ def connect_to_docker_API(url:str, owner:str, repo:str) -> dict: response = urllib.request.urlopen(request) return json.loads(response.read()) -def get_docker_stats(API:str, owner:str, repo:str) -> (int,int): - data:dict = connect_to_docker_API(API, owner, repo) +def get_docker_stats(API:str, owner:str, repo:str) -> tuple[int,int]: + data:dict = connect_to_docker_API(REPOSITORY_API_URL, owner, repo) pulls:int = data["pull_count"] stars:int = data["star_count"] logger.info("Pulls: {pulls}".format(pulls=pulls)) diff --git a/setup.py b/setup.py index fff09a4..e36cf7c 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,7 @@ setup( name='Github-stats-saver', + python_requires='>3.10.0', # match case in main file requires 3.10 or newer. 
diff --git a/repositories/docker.py b/repositories/docker.py
index 30d2bb9..1751a14 100644
--- a/repositories/docker.py
+++ b/repositories/docker.py
@@ -20,8 +20,8 @@ def connect_to_docker_API(url:str, owner:str, repo:str) -> dict:
     response = urllib.request.urlopen(request)
     return json.loads(response.read())
 
-def get_docker_stats(API:str, owner:str, repo:str) -> (int,int):
+def get_docker_stats(API:str, owner:str, repo:str) -> tuple[int,int]:
     data:dict = connect_to_docker_API(API, owner, repo)
     pulls:int = data["pull_count"]
     stars:int = data["star_count"]
     logger.info("Pulls: {pulls}".format(pulls=pulls))
diff --git a/setup.py b/setup.py
index fff09a4..e36cf7c 100644
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,7 @@
 setup(
     name='Github-stats-saver',
+    python_requires='>=3.10',  # match/case in the main file requires 3.10 or newer; otherwise 3.5 for the type hints would be enough
     version='0.1.0',
     description='Recovers stats of a package for several repositories',
     author='Fabián Robledo',
diff --git a/template.json b/template.json
new file mode 100644
index 0000000..0637e5d
--- /dev/null
+++ b/template.json
@@ -0,0 +1,54 @@
+{
+    "root_folder":"",
+    "backup" : {
+        "activate": false,
+        "method": "webdav",
+        "compression": ["tar.gz"],
+        "user": "",
+        "password": "",
+        "backup_url_folder":""
+    },
+
+    "tools": {
+        "sqanti3": {
+            "github": {
+                "owner":"",
+                "repo":"",
+                "apikey":"",
+                "savefile_prefix":""
+            },
+
+            "docker" : {
+                "owner":"",
+                "repo": "",
+                "apikey": "",
+                "savefile":""
+            }
+        },
+
+        "MOSim": {
+
+            "github": {
+                "owner": "",
+                "repo": "",
+                "apikey":"",
+                "savefile_prefix":""
+            },
+
+            "bioconductor": {
+                "package": "",
+                "savefile":""
+            }
+        },
+
+        "get_homologues" : {
+
+            "conda" : {
+                "owner":"",
+                "repo":"",
+                "savefile":""
+            }
+        }
+    }
+
+}
diff --git a/utils/backup.py b/utils/backup.py
index e34aa80..4768098 100644
--- a/utils/backup.py
+++ b/utils/backup.py
@@ -36,5 +36,6 @@ def _tar_gz(files: list, filename: str) -> None:
 
 def _upload(url: str, tarfile: str, remote_name:str, user:str, password:str):
-    files = open(tarfile, 'rb')
-    requests.put("{}/{}".format(url, remote_name), data=files, auth = HTTPBasicAuth(user, password))
+    # Upload via HTTP PUT to the webdav folder and report the status code to the caller
+    with open(tarfile, 'rb') as files:
+        req = requests.put("{}/{}".format(url, remote_name), data=files, auth=HTTPBasicAuth(user, password))
+    return req.status_code
\ No newline at end of file
diff --git a/utils/config_reader.py b/utils/config_reader.py
new file mode 100644
index 0000000..5695345
--- /dev/null
+++ b/utils/config_reader.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Dec 10 11:32:26 2024
+
+@author: frobledo
+"""
+
+import os
+import json
+import logging
+
+logger = logging.getLogger("Config reader")
+
+def load_config(path:str) -> dict:
+    path:str = os.path.abspath(path)
+    logger.info("Loading config file from: {path}".format(path=path))
+    with open(path) as handle:
+        config:dict = json.load(handle)
+    logger.info("Config file loaded successfully")
+    return config
\ No newline at end of file
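
`load_config()` trusts the JSON blindly, while `main()` dereferences `config["tools"]`, `config["root_folder"]` and `config["backup"]["activate"]` unconditionally. A hypothetical `validate_config()` helper (not part of this patch) could fail fast on malformed configs; a minimal sketch:

```python
# Hypothetical helper showing one way to fail fast on configs that are
# missing the keys main() relies on. Key names follow template.json.
import logging

logger = logging.getLogger("Config reader")

def validate_config(config: dict) -> None:
    # Top-level keys dereferenced unconditionally by main()
    for key in ("root_folder", "backup", "tools"):
        if key not in config:
            raise KeyError("Missing required config key: {}".format(key))
    # Backup credentials are only needed when the backup is activated
    if config["backup"].get("activate"):
        for key in ("backup_url_folder", "user", "password"):
            if key not in config["backup"]:
                raise KeyError("Backup activated but '{}' is missing".format(key))
    logger.debug("Config validated: %d tools", len(config["tools"]))
```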
diff --git a/utils/stacked_barplot.R b/utils/stacked_barplot.R
new file mode 100644
index 0000000..4a82720
--- /dev/null
+++ b/utils/stacked_barplot.R
@@ -0,0 +1,38 @@
+#!/usr/bin/env Rscript
+# Stacked barplot of cumulative SQANTI3 v5.3.0 downloads per source
+library(ggplot2)
+
+downloads <- read.csv("/home/frobledo/Software/SQANTI3/christmas_stats/download.csv")
+downloads_5.3 <- downloads[downloads$Version=="v5.3.0", ]
+downloads_5.3$source <- "Github releases"
+docker <- read.csv("/home/frobledo/Software/SQANTI3/christmas_stats/docker.csv")
+docker$source <- "Docker"
+clone <- read.csv("/home/frobledo/Software/SQANTI3/christmas_stats/clone.csv")
+clone$source <- "Github clones"
+
+data <- downloads_5.3[, c(1,3,4)]
+docker_data <- docker[, c(1,2,4)]
+colnames(docker_data) <- colnames(data)
+data <- rbind(data, docker_data)
+data$Date <- sub(" .*", "", data$Date)
+
+clone_data <- clone[, c(1,3,4)]
+colnames(clone_data) <- colnames(data)
+clone_data$Date <- sub("T.*","",clone_data$Date)
+clone_data$Date <- format(as.Date(clone_data$Date), "%d/%m/%Y")
+# sub() only replaces the first match, hence the repeated call
+clone_data$Date <- sub("-","/",clone_data$Date)
+clone_data$Date <- sub("-","/",clone_data$Date)
+# Keep only the rows inside the plotting window (hand-picked index)
+clone_data <- clone_data[35:length(clone_data$Date), ]
+clone_data$Downloads <- cumsum(clone_data$Downloads)
+data <- rbind(data, clone_data)
+data$Date <- as.Date(data$Date, "%d/%m/%Y")
+
+# Stacked barplot
+ggplot(data, aes(fill=source, y=Downloads, x=Date)) +
+    geom_bar(position="stack", stat="identity") +
+    scale_fill_manual(values=c("#f58a53", "#fdc659", "#15918a")) +
+    scale_x_date(date_labels="%d %b", date_breaks="1 day", limits=as.Date(c("2024-12-03", "2024-12-21"))) +
+    ylab("Cumulative Downloads") +
+    ggtitle("Cumulative Downloads of SQANTI3 v5.3.0")