From 7773a7590008782ce88c387ee481b1e1c09becdd Mon Sep 17 00:00:00 2001 From: Felix Date: Thu, 28 Nov 2024 11:14:19 +0100 Subject: [PATCH] code to automatically create resource documentation on Hugging Face --- helpers.py | 20 ++++++++ hf_gen/README.md | 21 ++++++++ hf_gen/create_docs.py | 45 ++++++++++++++++++ sb2hf.py | 108 ++++++++++++++++++++++++++++++++++-------- 4 files changed, 174 insertions(+), 20 deletions(-) create mode 100644 helpers.py create mode 100644 hf_gen/README.md create mode 100644 hf_gen/create_docs.py diff --git a/helpers.py b/helpers.py new file mode 100644 index 0000000..0ae59ea --- /dev/null +++ b/helpers.py @@ -0,0 +1,20 @@ +import requests + +def get_bibtex_from_doi(doi: str): + try: + doi_backend = "https://api.datacite.org/dois/application/x-bibtex" + response = requests.get(f"{doi_backend}/{doi}") + except requests.exceptions.HTTPError as err: + raise SystemExit(err) + return response.content + + +def get_value(resource_info : dict, key : str): + short_description = resource_info[key] + print(short_description) + if short_description.get('eng', None): + return short_description['eng'] + elif short_description.get('swe', None): + return short_description['swe'] + else: + return None \ No newline at end of file diff --git a/hf_gen/README.md b/hf_gen/README.md new file mode 100644 index 0000000..e954ab5 --- /dev/null +++ b/hf_gen/README.md @@ -0,0 +1,21 @@ +# {{title}} + +## Dataset Description + +- **Homepage:** [The official homepage of Språkbanken](https://spraakbanken.gu.se/resurser/) +- **Repository:** {{url}} +- **Point of Contact:**[sb-info@svenska.gu.se](sb-info@svenska.gu.se) + +### Dataset Summary + +{{description}} + +### Citation Information + +``` +{{bibtex}} +``` + +# Disclaimer + +This repository has been automatically created using the [sb2hf](https://github.com/felixhultin/sb2hf/tree/main) tool. 
\ No newline at end of file diff --git a/hf_gen/create_docs.py b/hf_gen/create_docs.py new file mode 100644 index 0000000..0e6c692 --- /dev/null +++ b/hf_gen/create_docs.py @@ -0,0 +1,45 @@ +""" + A module to automatically generate a Hugging Face README for an SBX resource + +""" + +import codecs +import logging +import requests +import yaml + +from jinja2 import Template + +from helpers import get_value +from urlparser import URLReader + + +def write_readme(url_reader: URLReader, metadata : dict, fp : str): + TEMP_LINK = "https://ws.spraakbanken.gu.se/ws/metadata-dev/" # TODO: remove this when endpoint is in production + bibtex_query = f"{TEMP_LINK}/bibtex?resource={url_reader.resource_name}&type={metadata['type']}" + logging.info(f"Fetching bibtex from {bibtex_query}") + bibtex = requests.get(bibtex_query).json()['bibtex'] + template_variables = { + 'description': get_value(metadata, 'description'), + 'title' : get_value(metadata, 'name'), + 'bibtex': bibtex, + 'url': url_reader.url + } + with open('hf_gen/README.md', 'r') as file: + template = Template(file.read(),trim_blocks=True) + rendered_file = template.render(**template_variables) + #output the file + hf_metadata = create_hf_metadata_yaml(metadata) + output_file = codecs.open(fp, "w", "utf-8") + readme = f"{hf_metadata}\n{rendered_file}" + output_file.write(readme) + output_file.close() + +def create_hf_metadata_yaml(metadata: dict): + yaml_content = yaml.dump({ + 'language': [l['code'] for l in metadata['languages']], + 'pretty_name': get_value(metadata, 'short_description') + }, + allow_unicode=True + ) + return f"---\n{yaml_content}---" \ No newline at end of file diff --git a/sb2hf.py b/sb2hf.py index d4d9aa1..96dd903 100644 --- a/sb2hf.py +++ b/sb2hf.py @@ -1,12 +1,25 @@ """ -Takes and creates an HuggingFace repository + +sb2hf.py + + A tool to automatically upload a Språkbanken resource to the Hugging Face Hub. 
+ """ import argparse import json import os +import logging import shutil +import requests + +import pandas as pd + +from huggingface_hub import create_repo, HfApi, login, whoami +from helpers import get_bibtex_from_doi, get_value +from hf_gen.create_docs import write_readme +from hf_gen.dataloader import load_corpus_file from urlparser import URLReader @@ -16,39 +29,94 @@ def write_repository(url_reader): if os.path.exists(output_folder): shutil.rmtree(output_folder) os.mkdir(output_folder) - url_reader.download_file(to=output_folder) - with open(f'{output_folder}/{url_reader.resource_name}_config.json', 'w') as f: - config = { - 'url': url_reader.url, - 'homepage': url_reader.url, - 'resource_name': url_reader.resource_name, - 'description': None, - 'citation': None - } - json.dump(config, f, indent=4, sort_keys=True) - shutil.copyfile('hf_gen/dataset_loading_script.py', f'{output_folder}/{url_reader.resource_name}.py') + metadata_query = f"{args.sbx_metadata_api}/metadata?resource={url_reader.resource_name}" + logging.info(f"Fetching metadata from {metadata_query}") + metadata = requests.get(metadata_query).json() + write_readme(url_reader, metadata, f'{output_folder}/README.md') + if args.upload_data: + url_reader.download_file(to=output_folder) + resource_fp = f'{output_folder}/{url_reader.bz2_local_path}' + tsv_fp = f'{output_folder}/all.tsv' + print(f"Converting {resource_fp} -> {tsv_fp}") + pd.DataFrame(load_corpus_file(resource_fp), columns = ['text', 'id']).to_csv(tsv_fp, sep='\t') + os.remove(resource_fp) + else: + with open(f'{output_folder}/{url_reader.resource_name}_config.json', 'w') as f: + desc = get_value(metadata, 'description') + config = { + 'url': url_reader.url, + 'homepage': url_reader.url, + 'resource_name': url_reader.resource_name, + 'description': desc, + 'citation': get_bibtex_from_doi(metadata['doi']).decode("utf-8") + } + json.dump(config, f, indent=4, sort_keys=True, ensure_ascii=False) + 
shutil.copyfile('hf_gen/dataset_loading_script.py', f'{output_folder}/{url_reader.resource_name}.py') def sb2hf(): url_reader = URLReader(args.url) - write_repository(url_reader) - print("Finished. Now you just need to commit the repository to HuggingFace.") - # TODO Create repository on HuggingFace programatically. + write_repository(url_reader) + if args.push_to_hub: + if not args.hf_token: + try: + user_info = whoami() + print(f"User is authenticated: {user_info['name']}") + except Exception: + try: + login() + except Exception: + print("Could not authenticate user.") + else: + print("Using API token for authentication.") + repo_id, repo_type = f"{args.hf_namespace}/{url_reader.resource_name}", "dataset" + create_repo( + repo_id=repo_id, + repo_type=repo_type, + private=args.hf_create_private_repo, + token=args.hf_token, + exist_ok=True, + ) + api = HfApi() + api.upload_folder( + folder_path=args.hf_output_folder, + repo_id=repo_id, + repo_type=repo_type, + ) + else: + print("Finished. 
Now you just need to upload the repository to HuggingFace.") if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = argparse.ArgumentParser( prog='sb2hf', description='Creates a HuggingFace repository from an existing SB resource') - parser.add_argument('url', help='URL to SB resource file (usually xml)') - parser.add_argument('--hf-output-folder', help='Where to locally save the resulting HuggingFace repository.') - parser.add_argument('--push-to-hub', help='If activated, pushes generated repository directly to hub.') + parser.add_argument('url', help='URL to resource page') + parser.add_argument('--hf-output-folder', help="Where to locally save the resulting HuggingFace repository.") + parser.add_argument('--push-to-hub', help="If activated, pushes generated repository directly to hub.", action='store_true', default=False) + parser.add_argument('--hf-namespace', help="Huggingface user or organization to push dataset to", default='sbx') + parser.add_argument('--hf-create-private-repo', help="Whether to create the Hugging Face repository as private", default=True) + parser.add_argument('--hf-token', help="Huggingface User Access Token to authenticate to the Hub", default=os.environ.get('HF_TOKEN', None)) + parser.add_argument('--sbx-metadata-api', help="API back-end to fetch information about SBX resources", default="https://ws.spraakbanken.gu.se/ws") + parser.add_argument('--upload-data', + help="""If set to True, the data (XML and JSON) + are uploaded to HuggingFace directly. Otherwise a + dataset loading script is uploaded which converts the data in + real time.""" + , default=True) + parser.add_argument( '-log', + '--loglevel', + default='warning', + help='Provide logging level. Example --loglevel debug, default=warning' ) parser.add_argument( '-r', '--row-output', choices=['sentences', 'tokens'], help='Which output the rows should be in.' 
) args = parser.parse_args() + logging.basicConfig( level=args.loglevel.upper() ) + if args.hf_token and not args.push_to_hub: + parser.error("--hf-token requires --push-to-hub") if args.hf_output_folder is None: args.hf_output_folder = args.url.split('/')[-1] - test = sb2hf() \ No newline at end of file + sb2hf()