From 70522e62a3564f76407d99948b17e42426094f69 Mon Sep 17 00:00:00 2001 From: Alexander Kovrigin Date: Sat, 16 Sep 2023 22:16:36 +0200 Subject: [PATCH] bleu evaluation + fixes --- LICENSE | 202 ++++++++++++++++++++++++ README.md | 4 + notebooks/1_collect_reviews.ipynb | 72 ++------- notebooks/2_inference.ipynb | 51 ++---- notebooks/3_evaluation.ipynb | 195 ++++++++++++++++++++--- requirements.txt | 1 - utils/smooth_bleu.py | 248 ++++++++++++++++++++++++++++++ 7 files changed, 659 insertions(+), 114 deletions(-) create mode 100644 LICENSE create mode 100644 utils/smooth_bleu.py diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..04a91a3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2023 Alexander Kovrigin + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index e69de29..6de261a 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,4 @@ +# CodeReviewer ML Performance + +![Static Badge](https://img.shields.io/badge/docs-available-orange?style=flat-square) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/psf/black) diff --git a/notebooks/1_collect_reviews.ipynb b/notebooks/1_collect_reviews.ipynb index 96ba53e..a6a74b4 100644 --- a/notebooks/1_collect_reviews.ipynb +++ b/notebooks/1_collect_reviews.ipynb @@ -14,25 +14,12 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "initial_id", "metadata": { - "collapsed": true, - "ExecuteTime": { - "end_time": "2023-09-13T21:16:51.561272700Z", - "start_time": "2023-09-13T21:16:50.880313100Z" - } + "collapsed": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\akovr\\AppData\\Local\\Temp\\ipykernel_17448\\323726258.py:5: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", - " from tqdm.autonotebook import tqdm\n" - ] - } - ], + "outputs": [], "source": [ "from getpass import getpass\n", "\n", @@ -53,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "outputs": [], "source": [ "token = getpass(\"Enter your Github Access Token: \")\n", @@ -66,11 +53,7 @@ " g = Github()" ], "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-09-13T21:16:58.651064100Z", - "start_time": "2023-09-13T21:16:51.561272700Z" - } + "collapsed": false }, "id": "87d1643cf1710bab" }, @@ -86,10 +69,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "outputs": [], "source": [ - "def collect_reviews(repo_name: str, num_comments: int = 1000, skip_author=True, allow_threads=False, save=True):\n", + "def collect_reviews(repo_name: str, num_comments: int = 1000, skip_author=True, allow_threads=False, save=True, max_length=512):\n", " \"\"\"\n", " Crawl a repo for code review data\n", " :param repo_name: Repo name in format \"owner/repo\"\n", @@ -97,6 +80,7 @@ " :param skip_author: Skip comments made by the author of the pull request\n", " :param allow_threads: Allow comments that are replies to other comments\n", " :param save: Save the data to a csv file\n", + " :param max_length: Maximum length of the diff hunk\n", " :return: Returns a pandas dataframe with columns diff_hunk, human_review, created_at\n", " \"\"\"\n", " data = []\n", @@ -115,6 +99,9 @@ " if comment.diff_hunk in hunks:\n", " # if we already have this diff hunk, skip\n", " continue\n", + " if len(comment.diff_hunk) > max_length:\n", + " # if the diff hunk is too long, skip\n", + " continue\n", " # get commit author\n", " commit_author = repo.get_git_commit(comment.commit_id).author\n", " if skip_author and comment.user == commit_author:\n", @@ -134,11 +121,7 @@ " return df" ], "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-09-13T21:16:58.680037100Z", - "start_time": "2023-09-13T21:16:58.601487200Z" - } + "collapsed": false }, "id": "b7df1e499a6c792b" }, @@ -161,47 +144,26 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "outputs": [], "source": [ "repos = ['microsoft/vscode', 'JetBrains/kotlin', 'transloadit/uppy']" ], "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-09-13T21:24:48.605227900Z", - "start_time": "2023-09-13T21:24:48.598220Z" - } + "collapsed": false }, "id": "dc6a0070723b6860" }, { "cell_type": "code", - "execution_count": 9, - "outputs": [ - { - "data": { - "text/plain": " 0%| | 0/1000 [00:00", ""), # strip "skipped" tags + (r"-\n", ""), # strip end-of-line hyphenation and join lines + (r"\n", " "), # join lines + # (r'(\d)\s+(?=\d)', r'\1'), # join digits +] +normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1] + +normalize2 = [ + ( + r"([\{-\~\[-\` -\&\(-\+\:-\@\/])", + r" \1 ", + ), # tokenize punctuation. apostrophe is missing + ( + r"([^0-9])([\.,])", + r"\1 \2 ", + ), # tokenize period and comma unless preceded by a digit + ( + r"([\.,])([^0-9])", + r" \1 \2", + ), # tokenize period and comma unless followed by a digit + (r"([0-9])(-)", r"\1 \2 "), # tokenize dash when preceded by a digit +] +normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2] + + +def normalize(s): + """Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl.""" + # Added to bypass NIST-style pre-processing of hyp and ref files -- wade + if nonorm: + return s.split() + if type(s) is not str: + s = " ".join(s) + # language-independent part: + for pattern, replace in normalize1: + s = re.sub(pattern, replace, s) + s = xml.sax.saxutils.unescape(s, {""": '"'}) + # language-dependent part (assuming Western languages): + s = " %s " % s + if not preserve_case: + s = s.lower() # this might not be identical to the original + for pattern, replace in normalize2: + s = re.sub(pattern, replace, s) + return s.split() + + +def count_ngrams(words, n=4): + counts = {} + for k in range(1, n + 1): + for i in range(len(words) - k + 1): + ngram = tuple(words[i : i + k]) + counts[ngram] = counts.get(ngram, 0) + 1 + return counts + + +def cook_refs(refs, n=4): + """Takes a list of reference sentences for a single segment + and returns an object that encapsulates everything that BLEU + needs to know about them.""" + + refs = [normalize(ref) for ref in refs] + maxcounts = {} + for ref in refs: + counts = count_ngrams(ref, n) + for ngram, count in counts.items(): + maxcounts[ngram] = max(maxcounts.get(ngram, 0), count) + return ([len(ref) for ref in refs], maxcounts) + + +def cook_test(test, item, n=4): + """Takes a test sentence and returns an object that + encapsulates everything that BLEU needs to know about it.""" + (reflens, refmaxcounts) = item + test = normalize(test) + result = {} + result["testlen"] = len(test) + + # Calculate effective reference sentence length. + + if eff_ref_len == "shortest": + result["reflen"] = min(reflens) + elif eff_ref_len == "average": + result["reflen"] = float(sum(reflens)) / len(reflens) + elif eff_ref_len == "closest": + min_diff = None + for reflen in reflens: + if min_diff is None or abs(reflen - len(test)) < min_diff: + min_diff = abs(reflen - len(test)) + result["reflen"] = reflen + + result["guess"] = [max(len(test) - k + 1, 0) for k in range(1, n + 1)] + + result["correct"] = [0] * n + counts = count_ngrams(test, n) + for ngram, count in counts.items(): + result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count) + + return result + + +def score_cooked(allcomps, n=4, ground=0, smooth=1): + totalcomps = {"testlen": 0, "reflen": 0, "guess": [0] * n, "correct": [0] * n} + for comps in allcomps: + for key in ["testlen", "reflen"]: + totalcomps[key] += comps[key] + for key in ["guess", "correct"]: + for k in range(n): + totalcomps[key][k] += comps[key][k] + logbleu = 0.0 + all_bleus = [] + for k in range(n): + correct = totalcomps["correct"][k] + guess = totalcomps["guess"][k] + addsmooth = 0 + if smooth == 1 and k > 0: + addsmooth = 1 + logbleu += math.log(correct + addsmooth + sys.float_info.min) - math.log( + guess + addsmooth + sys.float_info.min + ) + if guess == 0: + all_bleus.append(-10000000) + else: + all_bleus.append(math.log(correct + sys.float_info.min) - math.log(guess)) + + logbleu /= float(n) + all_bleus.insert(0, logbleu) + + brevPenalty = min( + 0, 1 - float(totalcomps["reflen"] + 1) / (totalcomps["testlen"] + 1) + ) + for i in range(len(all_bleus)): + if i == 0: + all_bleus[i] += brevPenalty + all_bleus[i] = math.exp(all_bleus[i]) + return all_bleus + + +def bleu(refs, candidate, ground=0, smooth=1): + refs = cook_refs(refs) + test = cook_test(candidate, refs) + return score_cooked([test], ground=ground, smooth=smooth) + + +def splitPuncts(line): + return " ".join(re.findall(r"[\w]+|[^\s\w]", line)) + + +def bleu_fromstr(predictions, golds, rmstop=True): + predictions = [ + " ".join(nltk.wordpunct_tokenize(predictions[i])) + for i in range(len(predictions)) + ] + golds = [" ".join(nltk.wordpunct_tokenize(g)) for g in golds] + if rmstop: + pypath = os.path.dirname(os.path.realpath(__file__)) + stopwords = open(os.path.join(pypath, "stopwords.txt")).readlines() + stopwords = [stopword.strip() for stopword in stopwords] + golds = [ + " ".join([word for word in ref.split() if word not in stopwords]) + for ref in golds + ] + predictions = [ + " ".join([word for word in hyp.split() if word not in stopwords]) + for hyp in predictions + ] + predictions = [ + str(i) + "\t" + pred.replace("\t", " ") for (i, pred) in enumerate(predictions) + ] + golds = [str(i) + "\t" + gold.replace("\t", " ") for (i, gold) in enumerate(golds)] + goldMap, predictionMap = computeMaps(predictions, golds) + bleu = round(bleuFromMaps(goldMap, predictionMap)[0], 2) + return bleu + + +def computeMaps(predictions, goldfile): + predictionMap = {} + goldMap = {} + + for row in predictions: + cols = row.strip().split("\t") + if len(cols) == 1: + (rid, pred) = (cols[0], "") + else: + (rid, pred) = (cols[0], cols[1]) + predictionMap[rid] = [splitPuncts(pred.strip().lower())] + + for row in goldfile: + (rid, pred) = row.split("\t") + if rid in predictionMap: # Only insert if the id exists for the method + if rid not in goldMap: + goldMap[rid] = [] + goldMap[rid].append(splitPuncts(pred.strip().lower())) + + sys.stderr.write("Total: " + str(len(goldMap)) + "\n") + return (goldMap, predictionMap) + + +# m1 is the reference map +# m2 is the prediction map +def bleuFromMaps(m1, m2): + score = [0] * 5 + num = 0.0 + + for key in m1: + if key in m2: + bl = bleu(m1[key], m2[key][0]) + score = [score[i] + bl[i] for i in range(0, len(bl))] + num += 1 + return [s * 100.0 / num for s in score] + + +if __name__ == "__main__": + reference_file = sys.argv[1] + predictions = [] + for row in sys.stdin: + predictions.append(row) + (goldMap, predictionMap) = computeMaps(predictions, reference_file) + print(bleuFromMaps(goldMap, predictionMap)[0])