Skip to content

Commit

Permalink
code to automatically create resource documentation on Hugging Face
Browse files Browse the repository at this point in the history
  • Loading branch information
felixhultin committed Nov 28, 2024
1 parent 8bb43e0 commit 7773a75
Show file tree
Hide file tree
Showing 4 changed files with 174 additions and 20 deletions.
20 changes: 20 additions & 0 deletions helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import requests

def get_bibtex_from_doi(doi: str):
    """Fetch the BibTeX entry for a DOI from the DataCite API.

    Args:
        doi: The DOI to look up. It is appended unchanged to the DataCite
            ``application/x-bibtex`` endpoint.

    Returns:
        The raw response body as returned by ``response.content``.

    Raises:
        SystemExit: If the request cannot be completed or the server
            answers with an HTTP error status.
    """
    doi_backend = "https://api.datacite.org/dois/application/x-bibtex"
    try:
        response = requests.get(f"{doi_backend}/{doi}", timeout=30)
        # requests.get() does not raise on 4xx/5xx by itself; without this
        # call an error page would be handed back as if it were BibTeX.
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        # RequestException also covers connection errors and timeouts,
        # in addition to the HTTPError raised by raise_for_status().
        raise SystemExit(err) from err
    return response.content


def get_value(resource_info: dict, key: str):
    """Return the preferred-language text stored under ``key``.

    The value at ``resource_info[key]`` is treated as a mapping from
    language code to text. English (``'eng'``) is preferred and Swedish
    (``'swe'``) is the fallback.

    Args:
        resource_info: Resource metadata dictionary.
        key: Name of the multilingual field to read.

    Returns:
        The first non-empty text among ``'eng'`` and ``'swe'``, or ``None``
        when the field is missing, empty, or has neither translation.
    """
    translations = resource_info.get(key)
    if not translations:
        # Missing key, None, or an empty mapping: nothing to choose from.
        return None
    for language in ('eng', 'swe'):
        text = translations.get(language)
        if text:
            return text
    return None
21 changes: 21 additions & 0 deletions hf_gen/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# {{title}}

## Dataset Description

- **Homepage:** [The official homepage of Språkbanken](https://spraakbanken.gu.se/resurser/)
- **Repository:** {{url}}
- **Point of Contact:** [[email protected]](mailto:[email protected])

### Dataset Summary

{{description}}

### Citation Information

```
{{bibtex}}
```

# Disclaimer

This repository has been automatically created using the [sb2hf](https://github.com/felixhultin/sb2hf/tree/main) tool.
45 changes: 45 additions & 0 deletions hf_gen/create_docs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""
A module to automatically generate a Hugging Face README for an SBX resource
"""

import codecs
import logging
import requests
import yaml

from jinja2 import Template

from helpers import get_value
from urlparser import URLReader


def write_readme(url_reader: URLReader, metadata: dict, fp: str):
    """Render the Hugging Face README for a resource and write it to ``fp``.

    The BibTeX citation is fetched from the metadata web service, the
    ``hf_gen/README.md`` Jinja2 template is rendered with the resource's
    title, description, citation and URL, and the result is written to
    ``fp`` preceded by the YAML block from ``create_hf_metadata_yaml``.

    Args:
        url_reader: Provides ``resource_name`` and ``url`` for the resource.
        metadata: Resource metadata; ``'type'``, ``'description'`` and
            ``'name'`` are read here.
        fp: Path of the README file to create (written as UTF-8).

    Raises:
        requests.exceptions.RequestException: If the BibTeX request fails
            or returns an HTTP error status.
    """
    # TODO: remove this when endpoint is in production
    TEMP_LINK = "https://ws.spraakbanken.gu.se/ws/metadata-dev/"
    # TEMP_LINK already ends with a slash; strip it so the query URL does
    # not contain "//bibtex".
    bibtex_query = (
        f"{TEMP_LINK.rstrip('/')}/bibtex"
        f"?resource={url_reader.resource_name}&type={metadata['type']}"
    )
    logging.info(f"Fetching bibtex from {bibtex_query}")
    response = requests.get(bibtex_query, timeout=30)
    # Fail with a clear HTTP error instead of a JSON/KeyError further down.
    response.raise_for_status()
    bibtex = response.json()['bibtex']
    template_variables = {
        'description': get_value(metadata, 'description'),
        'title': get_value(metadata, 'name'),
        'bibtex': bibtex,
        'url': url_reader.url,
    }
    # The template contains non-ASCII text, so do not rely on the platform
    # default encoding when reading it.
    with open('hf_gen/README.md', 'r', encoding='utf-8') as file:
        template = Template(file.read(), trim_blocks=True)
    rendered_file = template.render(**template_variables)
    hf_metadata = create_hf_metadata_yaml(metadata)
    readme = f"{hf_metadata}\n{rendered_file}"
    # Context manager guarantees the handle is closed even if write() fails.
    with codecs.open(fp, "w", "utf-8") as output_file:
        output_file.write(readme)

def create_hf_metadata_yaml(metadata: dict):
    """Build the YAML metadata block placed at the top of the README.

    Args:
        metadata: Resource metadata. ``metadata['languages']`` is iterated
            and the ``'code'`` entry of each item is collected; the
            ``'short_description'`` field is resolved through ``get_value``.

    Returns:
        The dumped YAML wrapped between ``---`` delimiter lines, without a
        trailing newline after the closing delimiter.
    """
    language_codes = []
    for language in metadata['languages']:
        language_codes.append(language['code'])
    front_matter = {
        'language': language_codes,
        'pretty_name': get_value(metadata, 'short_description'),
    }
    dumped = yaml.dump(front_matter, allow_unicode=True)
    return "---\n" + dumped + "---"
108 changes: 88 additions & 20 deletions sb2hf.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,25 @@
"""
Takes and creates an HuggingFace repository
sb2hf.py
A tool to automatically upload a Språkbanken resource to the Hugging Face Hub.
"""

import argparse
import json
import os
import logging
import shutil
import requests

import pandas as pd

from huggingface_hub import create_repo, HfApi, login, whoami

from helpers import get_bibtex_from_doi, get_value
from hf_gen.create_docs import write_readme
from hf_gen.dataloader import load_corpus_file
from urlparser import URLReader


Expand All @@ -16,39 +29,94 @@ def write_repository(url_reader):
if os.path.exists(output_folder):
shutil.rmtree(output_folder)
os.mkdir(output_folder)
url_reader.download_file(to=output_folder)
with open(f'{output_folder}/{url_reader.resource_name}_config.json', 'w') as f:
config = {
'url': url_reader.url,
'homepage': url_reader.url,
'resource_name': url_reader.resource_name,
'description': None,
'citation': None
}
json.dump(config, f, indent=4, sort_keys=True)
shutil.copyfile('hf_gen/dataset_loading_script.py', f'{output_folder}/{url_reader.resource_name}.py')
metadata_query = f"{args.sbx_metadata_api}/metadata?resource={url_reader.resource_name}"
logging.info(f"Fetching metadata from {metadata_query}")
metadata = requests.get(metadata_query).json()
write_readme(url_reader, metadata, f'{output_folder}/README.md')
if args.upload_data:
url_reader.download_file(to=output_folder)
resource_fp = f'{output_folder}/{url_reader.bz2_local_path}'
tsv_fp = f'{output_folder}/all.tsv'
print(f"Converting {resource_fp} -> {tsv_fp}")
pd.DataFrame(load_corpus_file(resource_fp), columns = ['text', 'id']).to_csv(tsv_fp, sep='\t')
os.remove(resource_fp)
else:
with open(f'{output_folder}/{url_reader.resource_name}_config.json', 'w') as f:
desc = get_value(metadata, 'description')
config = {
'url': url_reader.url,
'homepage': url_reader.url,
'resource_name': url_reader.resource_name,
'description': desc,
'citation': get_bibtex_from_doi(metadata['doi']).decode("utf-8")
}
json.dump(config, f, indent=4, sort_keys=True, ensure_ascii=False)
shutil.copyfile('hf_gen/dataset_loading_script.py', f'{output_folder}/{url_reader.resource_name}.py')


def sb2hf():
    """Build the local repository and optionally push it to the Hub.

    Configuration is read from the module-level ``args`` namespace. The
    repository is written once with ``write_repository``; when
    ``args.push_to_hub`` is set, a dataset repository named
    ``<hf_namespace>/<resource_name>`` is created (if needed) and the
    contents of ``args.hf_output_folder`` are uploaded to it.

    Raises:
        SystemExit: If pushing was requested, no token was supplied, and
            neither the existing login nor an interactive login succeeds.
    """
    url_reader = URLReader(args.url)
    write_repository(url_reader)
    if not args.push_to_hub:
        print("Finished. Now you just need to upload the repository to HuggingFace.")
        return

    if args.hf_token:
        print("Using API token for authentication.")
    else:
        try:
            user_info = whoami()
            print(f"User is authenticated: {user_info['name']}")
        except Exception:
            # whoami() failed (presumably no valid cached login), so fall
            # back to an interactive login.
            try:
                login()
            except Exception as err:
                # Without credentials the calls below cannot succeed, so
                # stop here instead of failing later with a less clear error.
                raise SystemExit("Could not authenticate user.") from err

    repo_id, repo_type = f"{args.hf_namespace}/{url_reader.resource_name}", "dataset"
    create_repo(
        repo_id=repo_id,
        repo_type=repo_type,
        private=args.hf_create_private_repo,
        token=args.hf_token,
        exist_ok=True,
    )
    # Pass the same token used for create_repo so the upload also works
    # when the token was only supplied via --hf-token; with token=None the
    # client falls back to the locally stored login.
    api = HfApi(token=args.hf_token)
    api.upload_folder(
        folder_path=args.hf_output_folder,
        repo_id=repo_id,
        repo_type=repo_type,
    )


def _str_to_bool(value):
    """Convert a command-line string such as ``"false"`` into a real bool."""
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ('true', 't', 'yes', 'y', '1'):
        return True
    if normalized in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


# Command-line entry point: parse the options into the module-level ``args``
# namespace (read by the functions above) and run the conversion once.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog='sb2hf',
        description='Creates a HuggingFace repository from an existing SB resource')
    parser.add_argument('url', help='URL to resource page')
    parser.add_argument('--hf-output-folder', help="Where to locally save the resulting HuggingFace repository.")
    parser.add_argument('--push-to-hub', help="If activated, pushes generated repository directly to hub.", action='store_true', default=False)
    parser.add_argument('--hf-namespace', help="Huggingface user or organization to push dataset to", default='sbx')
    # Parsed with _str_to_bool so that "--hf-create-private-repo False"
    # really yields False instead of the truthy string "False".
    parser.add_argument('--hf-create-private-repo', help="Whether the repository created on the Hugging Face Hub should be private (true/false)", type=_str_to_bool, default=True)
    # The HF_TOKEN environment fallback is applied after parsing so that the
    # --push-to-hub check below only reacts to an explicitly passed token.
    parser.add_argument('--hf-token', help="Huggingface User Access Token to authenticate to the Hub", default=None)
    parser.add_argument('--sbx-metadata-api', help="API back-end to fetch information about SBX resources", default="https://ws.spraakbanken.gu.se/ws")
    parser.add_argument('--upload-data',
                        help="""If set to True, the data (XML and JSON)
                        are uploaded to HuggingFace and uses the automated. Otherwise a
                        dataset loading script is uploaded which converts the data in
                        real time.""",
                        type=_str_to_bool,
                        default=True)
    parser.add_argument('-log',
                        '--loglevel',
                        default='warning',
                        help='Provide logging level. Example --loglevel debug, default=warning')
    parser.add_argument(
        '-r', '--row-output',
        choices=['sentences', 'tokens'],
        help='Which output the rows should be in.'
    )
    args = parser.parse_args()
    logging.basicConfig(level=args.loglevel.upper())
    # --push-to-hub is a store_true flag (never None), so test its truth
    # value; the previous "is None" comparison could never trigger.
    if args.hf_token and not args.push_to_hub:
        parser.error("--hf-token requires --push-to-hub")
    if args.hf_token is None:
        args.hf_token = os.environ.get('HF_TOKEN', None)
    if args.hf_output_folder is None:
        # Ignore a trailing slash so the folder name is never empty.
        args.hf_output_folder = args.url.rstrip('/').split('/')[-1]
    sb2hf()

0 comments on commit 7773a75

Please sign in to comment.