
Commit

Merge pull request #14 from wfrisch/refactor_20241218
Refactor 20241218
wfrisch authored Dec 18, 2024
2 parents cbdff7b + 98d5590 commit 580b68b
Showing 4 changed files with 82 additions and 65 deletions.
11 changes: 7 additions & 4 deletions config.py
@@ -22,6 +22,7 @@ def sparse_paths(self):
if re_source.match(p.name) and p.is_file():
yield p.relative_to(self.path)


"""
Library configuration.
@@ -694,7 +695,7 @@ def sparse_paths(self):
"src/w64.c",
]),
Library('libtiff', [
"libtiff/tif_dirread.c",
"libtiff/tif_dirread.c",
"contrib/pds/tif_pdsdirread.c",
"libtiff/tif_dir.c",
"libtiff/tif_jpeg.c",
@@ -1203,15 +1204,17 @@ def sparse_paths(self):
"botan": ["boringssl", "mbedtls"],
"fmt": ["googletest"],
"freetype": ["zlib"],
"libjxl": ["brotli", "googletest", "Little-CMS", "libjpeg-turbo", "libpng"],
"libjxl": ["brotli", "googletest", "Little-CMS", "libjpeg-turbo",
"libpng"],
"libpng": ["zlib"],
"libsndfile": ["flac"],
"lz4": ["xxHash"],
"minizip-ng": ["xz"],
"openjpeg": ["libpng", "libtiff", "Little-CMS", "zlib"],
"rapidjson": ["googletest"],
#"protobuf": ["googletest"],
"yyjson": ["rapidjson"], # embeds test cases
# "protobuf": ["googletest"],
# yyjson embeds test cases from rapidjson
"yyjson": ["rapidjson"],
"zlib": ["minizip-ng"],
"zstd": ["xxHash", "zlib"],
}
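The dependency map edited above pairs each library with the libraries whose files it is known to embed; prune() in index.py walks these pairs and drops any file in the embedding library whose hash also appears in the embedded one. A minimal sketch of that walk, assuming only the map's shape (a dict of library name to list of embedded library names; the driver code here is hypothetical):

```python
# Hypothetical driver, not the project's code: each (a_lib, b_lib) pair
# means "files a_lib shares with b_lib belong to b_lib".
dependency_map = {
    "freetype": ["zlib"],
    "zstd": ["xxHash", "zlib"],
}

for a_lib, b_libs in dependency_map.items():
    for b_lib in b_libs:
        print(f"{a_lib} -= {b_lib}")  # prune shared hashes from a_lib
```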
27 changes: 15 additions & 12 deletions git.py
@@ -8,6 +8,13 @@
from datetime import datetime, timezone, timedelta


CommitInfo = collections.namedtuple('CommitInfo', ['commit_hash',
'commit_time',
'paths',
'commit_desc'],
defaults=(None, None,))


def is_git_repository(directory):
if not os.path.isdir(directory):
return False
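The relocated CommitInfo above relies on namedtuple defaults, which bind right-to-left: only the last two fields, paths and commit_desc, become optional. A minimal sketch of that behavior (the sample values are hypothetical):

```python
import collections
from datetime import datetime, timezone

CommitInfo = collections.namedtuple(
    'CommitInfo',
    ['commit_hash', 'commit_time', 'paths', 'commit_desc'],
    defaults=(None, None,))  # applies to paths and commit_desc only

ci = CommitInfo('580b68b', datetime(2024, 12, 18, tzinfo=timezone.utc))
assert ci.paths is None and ci.commit_desc is None
```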
@@ -23,11 +30,6 @@ class GitException(Exception):
pass


CommitInfo = collections.namedtuple('CommitInfo',
['commit_hash', 'commit_time', 'paths', 'commit_desc'],
defaults=(None,None,))


class GitRepo:
def __init__(self, repo_path):
if not is_git_repository(repo_path):
@@ -59,7 +61,7 @@ def commits_affecting_file_follow(self, path):
"""List of commits for a file, including renames.
Returns [(commit,path), (commit,path), ...]"""
cmdline = self.gitcmd + ['log', '--pretty=format:%H', '--name-only',
'--follow', '--diff-filter=AMR', '--', path]
'--follow', '--diff-filter=AMR', '--', path]
proc = subprocess.run(cmdline, capture_output=True, text=True)
chunks = proc.stdout.strip().split('\n\n')
return [tuple(chunk.split('\n')) for chunk in chunks]
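commits_affecting_file_follow splits the log output on blank lines, so each chunk pairs a hash with the path the file had at that commit, keeping renames traceable. A sketch of that split on hypothetical output:

```python
# Hypothetical `git log --pretty=format:%H --name-only --follow` output:
# per commit, a hash line plus the file's then-current path, with
# commits separated by blank lines.
stdout = "98d5590\ngit.py\n\ncbdff7b\nold_git.py\n"
chunks = stdout.strip().split('\n\n')
pairs = [tuple(chunk.split('\n')) for chunk in chunks]
assert pairs == [('98d5590', 'git.py'), ('cbdff7b', 'old_git.py')]
```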
@@ -116,7 +118,7 @@ def first_commit(self):
root_commits = proc.stdout.splitlines()
if len(root_commits) > 1:
# multiple root commits are uncommon,
# but they do occur, for example in
# but they do occur, for example in
# https://github.com/google/googletest
root_commits.sort(key=lambda c: self.datetime(c))
return root_commits[0]
@@ -128,9 +130,9 @@ def count_commits(self):

def all_commits_with_metadata(self, path=None, describe=False) -> list[CommitInfo]:
result = []
cmdline = self.gitcmd + ['log', '--all', '--name-only',
'--date=iso', '--diff-filter=AMR', '--ignore-submodules',
'-z']
cmdline = self.gitcmd + ['log', '--all', '--name-only', '--date=iso',
'--diff-filter=AMR', '--ignore-submodules',
'-z']
if describe:
cmdline += ['--format=format:%(describe:tags) %H %ad']
else:
@@ -164,11 +166,12 @@ def all_commits_with_metadata(self, path=None, describe=False) -> list[CommitInfo]:
match = re_line0.match(lines[0])
if not match:
raise ValueError(f"ACWM({path}) unexpected line0: " + lines[0])
commit_desc = match.group('desc') # can be None
commit_desc = match.group('desc') # can be None
commit_hash = match.group('hash')
commit_time = datetime.fromisoformat(match.group('date'))
paths = list(filter(lambda line: len(line) > 0, lines[1:]))
result.append(CommitInfo(commit_hash, commit_time, paths, commit_desc))
result.append(CommitInfo(commit_hash, commit_time, paths,
commit_desc))
return result

# vim:set expandtab tabstop=4 shiftwidth=4 softtabstop=4 nowrap:
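The reflowed all_commits_with_metadata drives `git log --all --name-only -z`, so records arrive NUL-separated: one header line (hash and date, optionally a describe tag) followed by the paths the commit touched. A minimal parsing sketch under those assumptions; the header regex and sample stream are hypothetical stand-ins for the collapsed re_line0:

```python
import re

# Hypothetical stand-ins: a NUL-separated `git log --name-only -z`
# stream and a header regex for the '%H %ad' format (no describe tag).
fake_hash = "580b68b" + "0" * 33          # 40 hex chars
sample = (fake_hash + " 2024-12-18 10:00:00 +0100\n"
          "git.py\nindex.py\x00")

re_header = re.compile(r'^(?P<hash>[0-9a-f]{40}) (?P<date>.+)$')

for record in filter(None, sample.split('\x00')):
    lines = record.strip('\n').split('\n')
    m = re_header.match(lines[0])
    if not m:
        raise ValueError("unexpected header: " + lines[0])
    paths = [ln for ln in lines[1:] if ln]
    print(m.group('hash')[:7], m.group('date'), paths)
```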
90 changes: 49 additions & 41 deletions index.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
from pathlib import Path
import argparse
import collections
import concurrent.futures
@@ -13,9 +12,14 @@
import config


def libpath(lib):
return Path("libraries") / lib.name

# Types
FileRecord = collections.namedtuple('FileRecord', ['sha256',
'library',
'commit_hash',
'commit_time',
'commit_desc',
'path',
'size', ])

SCHEMA = '''
CREATE TABLE IF NOT EXISTS files (
@@ -38,17 +42,8 @@ def libpath(lib):
);
'''

SourceInfo = collections.namedtuple(
'SourceInfo', ['sha256',
'library',
'commit_hash',
'commit_time',
'commit_desc',
'path',
'size',
])


# CLI
parser = argparse.ArgumentParser()
parser.add_argument("-d", help="database path. Default: ./idlib.sqlite",
dest="db", default='idlib.sqlite')
@@ -71,14 +66,16 @@ def libpath(lib):
else:
libraries = config.libraries


# Sanity check
if len(libraries) == 0:
print("No libraries found.", file=sys.stderr)
sys.exit(1)

for lib in libraries:
print(f"Checking configuration for {lib.name:15s} ", end='')
try:
git = GitRepo(libpath(lib))
git = GitRepo(lib.path)
except ValueError as e:
print(e)
sys.exit(1)
@@ -89,11 +86,14 @@ def libpath(lib):
print()


# Setup database
con = sqlite3.connect(args.db)
cur = con.executescript(SCHEMA)
git = None # set by the initializer of each forked worker process


def get_sourceinfos(lib_name, commitinfo):
# Functions
def get_filerecords(lib_name, commitinfo):
global git
result = []
commit_hash, commit_time, paths, _ = commitinfo
@@ -106,7 +106,7 @@ def get_sourceinfos(lib_name, commitinfo):
m = hashlib.sha256()
m.update(blob)
sha256 = m.hexdigest()
result.append(SourceInfo(sha256=sha256,
result.append(FileRecord(sha256=sha256,
library=lib_name,
commit_hash=commit_hash,
commit_time=commit_time,
@@ -119,18 +119,17 @@ def get_sourceinfos(lib_name, commitinfo):
return result


def process_init(repo_path):
global git
git = GitRepo(repo_path)
def get_all_filerecords(repo_path, lib_name, commitinfos, max_workers):
result = []

def process_init(repo_path):
global git
git = GitRepo(repo_path)

def get_all_sourceinfos_parallel(repo_path, lib_name, commitinfos,
max_workers):
result = []
with ProcessPoolExecutor(max_workers=max_workers,
initializer=process_init,
initargs=(repo_path,)) as executor:
futures = [executor.submit(get_sourceinfos, lib_name, ci) for ci in
futures = [executor.submit(get_filerecords, lib_name, ci) for ci in
commitinfos]
for future in concurrent.futures.as_completed(futures):
result += future.result()
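get_all_filerecords leans on the ProcessPoolExecutor initializer hook: each forked worker builds its own GitRepo once, instead of shipping a handle with every task. A minimal sketch of that pattern with hypothetical worker functions:

```python
import concurrent.futures

handle = None  # per-process state, set once by the initializer


def init_worker(path):
    global handle
    handle = f"repo@{path}"  # stand-in for GitRepo(path)


def work(item):
    return f"{handle}:{item}"


if __name__ == '__main__':
    with concurrent.futures.ProcessPoolExecutor(
            max_workers=2, initializer=init_worker,
            initargs=('/tmp/repo',)) as ex:
        print(sorted(ex.map(work, [1, 2, 3])))
```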
@@ -141,20 +140,19 @@ def index_full(max_workers):
for lib in libraries:
print(f"Indexing library: {lib.name}")
sys.stdout.flush()
git = GitRepo(libpath(lib))
git = GitRepo(lib.path)
print("- fetching list of all commits")
commitinfos = git.all_commits_with_metadata()
num_files = 0
for ci in commitinfos:
num_files += len(ci.paths)
print(f"- found {num_files} files in {len(commitinfos)} commits")
sourceinfos = get_all_sourceinfos_parallel(libpath(lib), lib.name,
commitinfos,
max_workers=max_workers)
filerecords = get_all_filerecords(lib.path, lib.name, commitinfos,
max_workers=max_workers)
cur = con.cursor()
cur.execute('DELETE FROM files WHERE library = ?', (lib.name,))
for info in sourceinfos:
cur.execute('''INSERT INTO files VALUES (?,?,?,?,?,?,?)''', info)
cur.executemany('''INSERT INTO files VALUES (?,?,?,?,?,?,?)''',
filerecords)
con.commit()
print()
sys.stdout.flush()
@@ -164,20 +162,19 @@ def index_sparse(max_workers):
for lib in libraries:
print(f"Indexing library: {lib.name}")
sys.stdout.flush()
git = GitRepo(libpath(lib))
sourceinfos = []
git = GitRepo(lib.path)
filerecords = []
for p in lib.sparse_paths:
commitinfos = git.all_commits_with_metadata(path=p)
print(f"- found {len(commitinfos)} versions of {p}")
sys.stdout.flush()
sourceinfos += get_all_sourceinfos_parallel(libpath(lib), lib.name,
commitinfos,
max_workers=max_workers)
print(f"- total {len(sourceinfos)} files")
filerecords += get_all_filerecords(lib.path, lib.name, commitinfos,
max_workers=max_workers)
print(f"- total {len(filerecords)} files")
cur = con.cursor()
cur.execute('DELETE FROM files WHERE library = ?', (lib.name,))
for info in sourceinfos:
cur.execute('''INSERT INTO files VALUES (?,?,?,?,?,?,?)''', info)
cur.executemany('''INSERT INTO files VALUES (?,?,?,?,?,?,?)''',
filerecords)
con.commit()
print()
sys.stdout.flush()
@@ -197,19 +194,29 @@ def prune():
for b_lib in b_libs:
print(f" - {a_lib} -= {b_lib}")
to_delete = []
rows = cur.execute('''SELECT a.library, a.sha256, a.path FROM files a JOIN files b ON a.sha256 = b.sha256 WHERE a.library = ? AND b.library = ?;''', (a_lib, b_lib))
rows = cur.execute(
"SELECT a.library, a.sha256, a.path FROM files a "
"JOIN files b ON a.sha256 = b.sha256 "
"WHERE a.library = ? AND b.library = ?;",
(a_lib, b_lib))
for row in rows:
lib, sha256, path = row
to_delete.append((sha256, lib))
print(f" - delete in {a_lib}: {sha256} {path}")
for sha256, lib in to_delete:
cur.execute("DELETE FROM files WHERE sha256 = ? AND library = ?", (sha256, lib))
cur.execute("DELETE FROM files"
"WHERE sha256 = ? AND library = ?", (sha256, lib))
con.commit()

print("- delete remaining duplicates: (check this list carefully)")
cur = con.cursor()
to_delete = []
rows = cur.execute('''SELECT a.library,b.library,a.sha256,a.path FROM files a JOIN (SELECT sha256,library,path,COUNT(*) c FROM files GROUP BY sha256 HAVING c > 1) b ON a.sha256 = b.sha256 WHERE a.library != b.library AND a.size > 0 ORDER BY a.library DESC;''')
rows = cur.execute(
"SELECT a.library, b.library, a.sha256, a.path FROM files a "
"JOIN (SELECT sha256,library,path,COUNT(*) c FROM files "
"GROUP BY sha256 HAVING c > 1) b ON a.sha256 = b.sha256 "
"WHERE a.library != b.library AND a.size > 0 "
"ORDER BY a.library DESC;")
for row in rows:
a_lib, b_lib, sha256, a_path = row
to_delete.append(sha256)
@@ -224,6 +231,7 @@ def prune():
con.commit()


# Main
if not args.prune_only:
if args.mode == 'sparse':
index_sparse(args.max_workers)
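prune()'s final query finds blobs shared across libraries by joining files against a grouped subselect on sha256. The same shape against a toy table (hypothetical data), to make the join visible:

```python
import sqlite3

con = sqlite3.connect(':memory:')
con.execute('CREATE TABLE files (sha256 TEXT, library TEXT, path TEXT, size INT)')
con.executemany('INSERT INTO files VALUES (?,?,?,?)', [
    ('aa', 'zlib', 'adler32.c', 100),
    ('aa', 'minizip-ng', 'zlib/adler32.c', 100),  # same blob, two libraries
    ('bb', 'zlib', 'inflate.c', 200),
])
rows = con.execute(
    "SELECT a.library, b.library, a.sha256, a.path FROM files a "
    "JOIN (SELECT sha256, library, COUNT(*) c FROM files "
    "GROUP BY sha256 HAVING c > 1) b ON a.sha256 = b.sha256 "
    "WHERE a.library != b.library AND a.size > 0")
print(rows.fetchall())  # one pairing for the shared 'aa' blob
```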
19 changes: 11 additions & 8 deletions metric.py
@@ -65,14 +65,19 @@
from concurrent.futures import ThreadPoolExecutor
import argparse
import collections
import json
from pathlib import Path
import re
import sys

from git import GitRepo


# Types
Candidate = collections.namedtuple('Candidate',
'path,score,time_cov,commit_cov')


# CLI
parser = argparse.ArgumentParser(
prog="metric.py",
description="helps with the file selection for new libraries")
@@ -81,23 +86,25 @@
help="limit the number of results. default: 20")
args = parser.parse_args()


# Sanity check
repo_path = Path(args.repo_path)
assert repo_path.is_dir()
git = GitRepo(repo_path)


Candidate = collections.namedtuple('Candidate', 'path,score,time_cov,commit_cov')


# Main
all_commitinfos = git.all_commits_with_metadata()


def num_commits_in_timespan(git, start, end):
cnt = 0
for ci in all_commitinfos:
if ci.commit_time >= start and ci.commit_time <= end:
cnt += 1
return cnt


def evaluate(git, path) -> float:
print("eval:", path)
sys.stdout.flush()
@@ -159,10 +166,6 @@ def evaluate_all(git, paths):

print()
print("Suggested config:")

#paths = [s.path for c in candidates[:args.limit]]
#print(json.dumps(paths, indent=4))

print(' [')
for c in candidates[:args.limit]:
print(f' "{c.path}",')
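metric.py scores candidate files by how much of the repository's history they cover, both in wall-clock time and in commit count; the exact formula inside evaluate() is collapsed in this diff. A hypothetical sketch of coverage ratios built only from the visible pieces (Candidate's time_cov and commit_cov fields):

```python
from datetime import datetime, timezone

# Hypothetical coverage ratios (the real scoring in evaluate() is not
# shown here): fraction of repo history spanned by a file's commits.
def coverage(file_times, repo_start, repo_end, n_file, n_repo):
    time_cov = ((max(file_times) - min(file_times)).total_seconds()
                / (repo_end - repo_start).total_seconds())
    commit_cov = n_file / n_repo
    return time_cov, commit_cov

t = lambda y: datetime(y, 1, 1, tzinfo=timezone.utc)
print(coverage([t(2016), t(2024)], t(2014), t(2024), 50, 1000))
# ~ (0.8, 0.05): the file spans 80% of the history with 5% of commits
```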
