diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..14e2f77 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. 
"Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. 
Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. 
Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. 
Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. 
However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. 
* +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. * +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. 
+Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. 
If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/enso/config.py b/enso/config.py index d8557c8..e932d84 100644 --- a/enso/config.py +++ b/enso/config.py @@ -1,6 +1,9 @@ +import multiprocessing +import functools + import indicoio + from enso.mode import ModeKeys -import multiprocessing """Constants to configure the rest of Enso.""" @@ -14,130 +17,53 @@ FEATURES_DIRECTORY = "Features" # Directory for storing experiment results -EXPERIMENT_NAME = "Exp" +EXPERIMENT_NAME = "BERT-Showdown" # Name of the csv used to store results RESULTS_CSV_NAME = "Results.csv" # Datasets to featurize or run experiments on DATA = { -# "Classify/AirlineComplaints", - "Classify/AirlineNegativity", - # "Classify/AirlineSentiment", - # "Classify/BrandEmotion", - # "Classify/BrandEmotionCause", - # "Classify/ChemicalDiseaseCauses", - # "Classify/CorporateMessaging", - # "Classify/CustomerReviews", - # "Classify/DetailedEmotion", - # "Classify/Disaster", - # "Classify/DrugReviewIntent", - # "Classify/DrugReviewType", - # "Classify/Economy", - # "Classify/Emotion", - # "Classify/GlobalWarming", - # "Classify/Horror", - # "Classify/HotelReviews", - # "Classify/IMDB", - # "Classify/Irony", - # "Classify/MPQA", - # "Classify/MovieReviews", - # "Classify/NewYearsResolutions", - # "Classify/PoliticalTweetAlignment", - # "Classify/PoliticalTweetBias", - # 
"Classify/PoliticalTweetClassification", - # "Classify/PoliticalTweetSubjectivity", - # "Classify/PoliticalTweetTarget", - # "Classify/ReligiousTexts", - # "Classify/ShortAnswer", - # "Classify/SocialMediaDisasters", - # "Classify/Subjectivity", - # "Classify/TextSpam", - # "Classify/SST-binary" - # Seqence - # 'SequenceLabeling/Reuters-128', - # 'SequenceLabeling/bonds', - # 'SequenceLabeling/table_synth', - # 'SequenceLabeling/bonds_new', - # 'SequenceLabeling/tables', - # 'SequenceLabeling/typed_cols', - # 'SequenceLabeling/brown_all', - # 'SequenceLabeling/brown_nouns', - # 'SequenceLabeling/brown_verbs', - # 'SequenceLabeling/brown_pronouns', - # 'SequenceLabeling/brown_adverbs', - # 'RationalizedClassify/short_bank_qualified', - # 'RationalizedClassify/bank_qualified', - # 'RationalizedClassify/evidence_inference', - # 'RationalizedClassify/federal_tax', - # 'RationalizedClassify/short_federal_tax', - # 'RationalizedClassify/interest_frequency', - # 'RationalizedClassify/short_interest_frequency', - # 'RationalizedClassify/aviation', - # 'RationalizedClassify/movie_reviews', - # 'RationalizedClassify/mining' + # "SequenceLabeling/Reuters-128", + # "SequenceLabeling/bonds", + # "SequenceLabeling/bonds_new", + "SequenceLabeling/cord", + "SequenceLabeling/invoices", + # "SequenceLabeling/correspondence", + # "SequenceLabeling/d_invoices", + # "SequenceLabeling/C_data", + # "SequenceLabeling/H_data", + # "SequenceLabeling/wiki", } # Featurizers to activate FEATURIZERS = { - # "PlainTextFeaturizer", - # "TextContextFeaturizer", - # "IndicoStandard", - "SpacyGloveFeaturizer", - # "IndicoFastText", - # "IndicoSentiment", - # "IndicoElmo", - # "IndicoTopics", - # "IndicoFinance", - # "IndicoTransformer", - # "IndicoEmotion", - # "IndicoFastText", - # "SpacyCNNFeaturizer", + "PlainTextFeaturizer", } # Experiments to run EXPERIMENTS = { - # "FinetuneSequenceLabel", - # "RoBERTaSeqLab", - # "SidekickSeqLab", - # "LambertSeqLab", - # "IndicoSequenceLabel" - # 
"LRBaselineNonRationalized", - # "DistReweightedGloveClassifierCV", - # "RationaleInformedLRCV" - # 'DistReweightedGloveClassifierCV' - # "FinetuneSeqBaselineRationalized", - # "FinetuneClfBaselineNonRationalized", -# "LogisticRegressionCV", - "KNNCV", -# "TfidfKNN", -# "TfidfLogisticRegression", -# "KCenters", -# "TfidfKCenters" - # "SupportVectorMachineCV", + # "FinetuneRoberta", + "FinetuneAlbert" } # Metrics to compute METRICS = { - "Accuracy", - "MacroRocAuc", - # "AccuracyRationalized", - # "MacroRocAucRationalized", - # "MacroCharF1", - # "MacroCharRecall", - # "MacroCharPrecision" + "MicroCharF1", + "MicroCharRecall", + "MicroCharPrecision", + "MacroCharF1", + "MacroCharRecall", + "MacroCharPrecision", } # Test setup metadata TEST_SETUP = { - "train_sizes": [20, 40, 60, 80, 100, 150, 200, 300, 400, 500], - "n_splits": 5, - # "samplers": ['RandomRationalized'], - # "samplers": ["ImbalanceSampler"], - "samplers": ["Random"], + "train_sizes": [50, 100, 150, 200, 250, 300, 350, 400, 450, 500], + "n_splits": 3, + "samplers": ["RandomSequence"], "sampling_size": 0.2, - "resamplers": ['NoResampler'] - # "resamplers": ["RandomOverSampler"], + "resamplers": ["NoResampler"], } # Visualizations to display @@ -159,7 +85,7 @@ }, } -MODE = ModeKeys.CLASSIFY +MODE = ModeKeys.SEQUENCE N_GPUS = 1 N_CORES = 1 # multiprocessing.cpu_count() diff --git a/enso/experiment/__init__.py b/enso/experiment/__init__.py index 6125e4b..06075c3 100644 --- a/enso/experiment/__init__.py +++ b/enso/experiment/__init__.py @@ -19,7 +19,12 @@ from sklearn.model_selection import ParameterGrid from enso.sample import sample -from enso.utils import feature_set_location, BaseObject, SafeStratifiedShuffleSplit, RationalizedStratifiedShuffleSplit +from enso.utils import ( + feature_set_location, + BaseObject, + SafeStratifiedShuffleSplit, + RationalizedStratifiedShuffleSplit, +) from enso.mode import ModeKeys from enso.config import ( FEATURIZERS, @@ -32,10 +37,11 @@ MODE, EXPERIMENT_NAME, 
RESULTS_CSV_NAME, - EXPERIMENT_PARAMS + EXPERIMENT_PARAMS, ) from enso.registry import Registry, ValidateExperiments from multiprocessing import Process +from threading import Thread POOL = ProcessPoolExecutor(N_CORES) @@ -116,14 +122,20 @@ def experiment_has_been_run(self, current_settings): return True def _run_sub_experiment( - self, experiment_cls, dataset, train, test, target, current_setting, experiment_hparams=None + self, + experiment_cls, + dataset, + train, + test, + target, + current_setting, + experiment_hparams=None, ): if experiment_hparams is None: experiment_hparams = {} experiment = experiment_cls( - Registry.get_resampler(current_setting["Resampler"]), - **experiment_hparams + Registry.get_resampler(current_setting["Resampler"]), **experiment_hparams ) name = experiment.name() internal_setting = {"Experiment": name} @@ -178,21 +190,26 @@ def _run_experiment(self, dataset_name, current_setting, experiments): if experiment_cls.__name__ not in exp_params: exp_params[experiment_cls.__name__] = {} # add the 'All' configuration to all the experiments - exp_params = {('All' if k.upper() == 'ALL' else k):v for k, v in exp_params.items()} - if 'All' in EXPERIMENT_PARAMS.keys(): + exp_params = { + ("All" if k.upper() == "ALL" else k): v for k, v in exp_params.items() + } + if "All" in EXPERIMENT_PARAMS.keys(): for k, v in exp_params.items(): - if k != 'All': - # Note that specific experiment hparam values take precedent over the 'All' values - exp_params[k] = deepcopy(EXPERIMENT_PARAMS['All']) + if k != "All": + # Note that specific experiment hparam values take precedence over the 'All' values + exp_params[k] = deepcopy(EXPERIMENT_PARAMS["All"]) exp_params[k].update(v) - del exp_params['All'] + del exp_params["All"] hparams_by_experiment = { - exp_name: list(ParameterGrid(hparams)) for exp_name, hparams in exp_params.items() + exp_name: list(ParameterGrid(hparams)) + for exp_name, hparams in exp_params.items() } # make sure all experiments share the same 
experiment param keys (not necessarily values) param_keys = list(list(exp_params.values())[0].keys()) - assert all(set(param_keys) == set(hparams.keys()) - for hparams in exp_params.values()) + assert all( + set(param_keys) == set(hparams.keys()) + for hparams in exp_params.values() + ) # add the experiment params to self.columns self.columns += param_keys else: @@ -211,7 +228,9 @@ def _run_experiment(self, dataset_name, current_setting, experiments): current_setting["TrainSize"], ) for experiment_cls in experiments: - for experiment_hparams in hparams_by_experiment.get(experiment_cls.__name__, [{}]): + for experiment_hparams in hparams_by_experiment.get( + experiment_cls.__name__, [{}] + ): try: # Ideally we wouldn't have to do this in a process, but at the moment # creating a process and killing the process after execution is the @@ -225,7 +244,7 @@ def _run_experiment(self, dataset_name, current_setting, experiments): "test": test, "target": target, "current_setting": current_setting, - "experiment_hparams": experiment_hparams + "experiment_hparams": experiment_hparams, }, ) p.start() @@ -240,19 +259,21 @@ def _run_experiment(self, dataset_name, current_setting, experiments): time.sleep(0.1) return results - + @staticmethod - def _get_results_row(name, internal_setting, test_score, test_key, train_score=None, train_key=None): + def _get_results_row( + name, internal_setting, test_score, test_key, train_score=None, train_key=None + ): full_setting = {"Metric": name, test_key: test_score} - # measure score on train set to help detect overfitting + # measure score on train set to help detect overfitting if train_score is not None: full_setting[train_key] = train_score - + full_setting.update(internal_setting) full_setting_df = pd.DataFrame.from_records([full_setting]) return full_setting_df - + def _measure_experiment( self, target, @@ -263,7 +284,7 @@ def _measure_experiment( test_key="Result", train_key="TrainResult", train_time=None, - pred_time=None, + 
pred_time=None, ): """Responsible for recording all metrics specified in config for a given experiment.""" results = pd.DataFrame(columns=self.columns) @@ -274,9 +295,12 @@ def _measure_experiment( else: train_score = None full_setting_df = self._get_results_row( - metric.name(), internal_setting, - test_score=score, test_key=test_key, - train_score=train_score, train_key=train_key + metric.name(), + internal_setting, + test_score=score, + test_key=test_key, + train_score=train_score, + train_key=train_key, ) results = results.append(full_setting_df, ignore_index=True) if train_time is not None: @@ -284,7 +308,7 @@ def _measure_experiment( name="train_time", internal_setting=internal_setting, test_score=train_time, - test_key=test_key + test_key=test_key, ) results = results.append(full_setting_df, ignore_index=True) if pred_time is not None: @@ -292,7 +316,7 @@ def _measure_experiment( name="pred_time", internal_setting=internal_setting, test_score=pred_time, - test_key=test_key + test_key=test_key, ) results = results.append(full_setting_df, ignore_index=True) return results @@ -327,22 +351,22 @@ def _split_dataset(dataset, training_size): for column in dataset.columns.values if column.startswith("Target") ] + if not target_list: + raise ValueError("No `Target` column in dataset") logging.info("Training with train set of size: %s" % training_size) - # Sklearn technically offers a train_size parameter that seems like it would be better - # Unfortunately it doesn't work as expected and locks test size to train size test_size = int(len(dataset) * TEST_SETUP["sampling_size"]) if test_size + training_size > len(dataset): logging.warning( ( - "Invalid training size provided. Training size must be less than {sample_size} of dataset size." - "the length of dataset is {ds_size}, but we have a train size of {train_size} and test {test_size}" + "Invalid training size provided. Test size with this configuration is less than the required {sample_fraction}% of dataset size." 
+ "The length of dataset is {ds_size}, but we have a train size of {train_size} and test {test_size}" ).format( - sample_size=TEST_SETUP["sampling_size"], + sample_fraction=TEST_SETUP["sampling_size"] * 100, ds_size=len(dataset), train_size=training_size, - test_size=test_size + test_size=test_size, ) ) return @@ -353,13 +377,13 @@ def _split_dataset(dataset, training_size): ) elif MODE in [ModeKeys.RATIONALIZED]: splitter = RationalizedStratifiedShuffleSplit( - TEST_SETUP['n_splits'], test_size=test_size + TEST_SETUP["n_splits"], test_size=test_size ) elif MODE in [ModeKeys.SEQUENCE]: splitter = ShuffleSplit(TEST_SETUP["n_splits"], test_size=test_size) else: raise ValueError( - "config.MODE needs to be either ModeKeys.CLASSIFY or ModeKeys.SEQUENCE" + "config.MODE needs to be one of [ModeKeys.CLASSIFY, ModeKeys.SEQUENCE, ModeKeys.RATIONALIZED]" ) for target in target_list: @@ -496,4 +520,4 @@ def __init__(self, *args, **kwargs): from enso.experiment import tfidf from enso.experiment import knn from enso.experiment import rationalized -from enso.experiment import doc_rep \ No newline at end of file +from enso.experiment import doc_rep diff --git a/enso/experiment/finetuning.py b/enso/experiment/finetuning.py index 45ea65b..2e5650d 100644 --- a/enso/experiment/finetuning.py +++ b/enso/experiment/finetuning.py @@ -6,6 +6,8 @@ from indicoio.custom import Collection from finetune import Classifier, SequenceLabeler +from finetune.base_models.bert.model import RoBERTa +from finetune.base_models.huggingface.models import HFAlbert from enso.experiment import ClassificationExperiment from enso.config import RESULTS_DIRECTORY @@ -16,7 +18,9 @@ from enso.utils import labels_to_binary -@Registry.register_experiment(ModeKeys.CLASSIFY, requirements=[("Featurizer", "PlainTextFeaturizer")]) +@Registry.register_experiment( + ModeKeys.CLASSIFY, requirements=[("Featurizer", "PlainTextFeaturizer")] +) class Finetune(ClassificationExperiment): """ LanguageModel finetuning as an 
alternative to simple models trained on top of pretrained features. @@ -45,18 +49,23 @@ def cleanup(self): del self.model -@Registry.register_experiment(ModeKeys.SEQUENCE, requirements=[("Featurizer", "PlainTextFeaturizer")]) +@Registry.register_experiment( + ModeKeys.SEQUENCE, requirements=[("Featurizer", "PlainTextFeaturizer")] +) class IndicoSequenceLabel(ClassificationExperiment): - def __new__(cls, *args, **kwargs): - raise Exception("DO NOT run this at the moment.... - waiting to hear from Madison") + raise Exception( + "DO NOT run this at the moment.... - waiting to hear from Madison" + ) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.model = None def fit(self, X, y): - self.model = Collection("Enso-Sequence-Labeling-{}".format(str(hash(str(X) + str(y))))) + self.model = Collection( + "Enso-Sequence-Labeling-{}".format(str(hash(str(X) + str(y)))) + ) try: self.model.clear() except: @@ -74,14 +83,17 @@ def predict(self, X, **kwargs): num_batches = num_samples // batch_size predictions = [] for i in range(num_batches): - data = X[i * batch_size: (i + 1) * batch_size] + data = X[i * batch_size : (i + 1) * batch_size] predictions.extend(self.model.predict(data)) return predictions def cleanup(self): self.model.clear() -@Registry.register_experiment(ModeKeys.SEQUENCE, requirements=[("Featurizer", "PlainTextFeaturizer")]) + +@Registry.register_experiment( + ModeKeys.SEQUENCE, requirements=[("Featurizer", "PlainTextFeaturizer")] +) class FinetuneSequenceLabel(ClassificationExperiment): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -94,3 +106,37 @@ def fit(self, X, y): def predict(self, X, **kwargs): return self.model.predict(X) + + +@Registry.register_experiment( + ModeKeys.SEQUENCE, requirements=[("Featurizer", "PlainTextFeaturizer")] +) +class FinetuneRoberta(FinetuneSequenceLabel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.model_config = dict(val_size=0, 
base_model=RoBERTa, class_weights="sqrt") + self.model_config.update(kwargs) + self.model = SequenceLabeler(**self.model_config) + + +@Registry.register_experiment( + ModeKeys.SEQUENCE, requirements=[("Featurizer", "PlainTextFeaturizer")] +) +class FinetuneAlbert(FinetuneSequenceLabel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.model_config = dict( + val_size=0, + base_model=HFAlbert, + class_weights="sqrt", + n_epochs=8, + chunk_long_sequences=False, + ) + self.model_config.update(kwargs) + self.model = SequenceLabeler(**self.model_config) + + def fit(self, X, y): + self.model.fit(X, y) + + def predict(self, X, **kwargs): + return self.model.predict(X) diff --git a/enso/featurize/__init__.py b/enso/featurize/__init__.py index 85b859c..4c9bf47 100644 --- a/enso/featurize/__init__.py +++ b/enso/featurize/__init__.py @@ -33,7 +33,11 @@ def _run(self, POOL): featurizer.load() for dataset_name in DATA: dataset = self._load_dataset(dataset_name) - logging.info("Featurizing {} with {}....".format(dataset_name, featurizer.__class__.__name__)) + logging.info( + "Featurizing {} with {}....".format( + dataset_name, featurizer.__class__.__name__ + ) + ) future = POOL.submit(featurizer.generate, dataset, dataset_name) futures[future] = (featurizer, dataset_name) @@ -44,14 +48,16 @@ def _run(self, POOL): logging.info( "Completed featurization of dataset `{dataset_name}` with featurizer `{featurizer}`.".format( dataset_name=dataset_name, - featurizer=featurizer.__class__.__name__ - )) + featurizer=featurizer.__class__.__name__, + ) + ) except Exception as e: logging.exception( "Failed featurization of dataset `{dataset_name}` with featurizer `{featurizer}`.".format( dataset_name=dataset_name, - featurizer=featurizer.__class__.__name__ - )) + featurizer=featurizer.__class__.__name__, + ) + ) def run(self, n_jobs=N_CORES): """ @@ -68,15 +74,17 @@ def run(self, n_jobs=N_CORES): @staticmethod def _load_dataset(dataset_name): """Responsible for 
finding datasets and reading them into dataframes.""" - dataset = "Data/%s" % dataset_name # TODO Data is hard coded although seems configurable from config. + dataset = ( + "Data/%s" % dataset_name + ) # TODO Data is hard coded although seems configurable from config. if "SequenceLabeling" in dataset or "RationalizedClassify" in dataset: with open("%s.json" % dataset, "rt") as fp: return json.load(fp) elif "Classify" in dataset: df = pd.read_csv("%s.csv" % dataset) - if 'Text' not in df: + if "Text" not in df: raise ValueError("File: %s has no column 'Text'" % dataset_name) - if 'Target' not in df: + if "Target" not in df: raise ValueError("File %s has no column 'Target'" % dataset_name) return df elif "DocRep" in dataset: @@ -124,16 +132,28 @@ def generate(self, dataset, dataset_name): if type(dataset) == list: text = [d[0] for d in dataset] features = self._features_from_text(text) - new_dataset = pd.DataFrame(data={ - "Text": text, - "Targets": [d[1] for d in dataset], - "Features": features - }) - + new_dataset = pd.DataFrame( + data={ + "Text": text, + "Targets": [d[1] for d in dataset], + "Features": features, + } + ) + elif type(dataset) == dict: + text = dataset["text"] + new_dataset = pd.DataFrame( + data={ + "Text": text, + "Targets": dataset["labels"], + "Features": self._features_from_text(text), + } + ) elif type(dataset) == pd.DataFrame: features = self._features_from_text(dataset["Text"]) - new_dataset = dataset.copy() # Don't want to modify the underlying dataframe - new_dataset['Features'] = features + new_dataset = ( + dataset.copy() + ) # Don't want to modify the underlying dataframe + new_dataset["Features"] = features else: raise ValueError("Unrecognised data format!!") @@ -158,9 +178,11 @@ def _features_from_text(self, text_batches): try: features = [self.featurize(entry) for entry in text_batches] except (NotImplementedError, AttributeError): - raise NotImplementedError(""" + raise NotImplementedError( + """ Featurizers must implement the 
featurize_list, or the featurize method - """) + """ + ) return features def _write(self, featurized_dataset, dataset_name):