diff --git a/README.md b/README.md index 993ac3e..0f8e40b 100644 --- a/README.md +++ b/README.md @@ -32,12 +32,12 @@ that generated the Text-Fabric data set for these quotes. # Getting started Start with the -[tutorial](https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/banks/use.ipynb). +[tutorial](https://nbviewer.jupyter.org/github/annotation/banks/blob/master/tutorial/use.ipynb). Or fire up a terminal and say (provided you have text-fabric installed): ``` -text-fabric banks +text-fabric annotation/banks ``` # Author diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/app.py b/app/app.py new file mode 100644 index 0000000..8dc7cde --- /dev/null +++ b/app/app.py @@ -0,0 +1,15 @@ +from tf.advanced.app import App + + +class TfApp(App): + def __init__(app, *args, **kwargs): + super().__init__(*args, **kwargs) + + def fmt_layoutRich(app, n, **kwargs): + api = app.api + F = api.F + after = f'{F.punc.v(n) or ""} ' + isGap = F.gap.v(n) + material = F.letters.v(n) or "" + layout = f'{material}' if isGap else material + return f"{layout}{after}" diff --git a/app/config.yaml b/app/config.yaml new file mode 100644 index 0000000..ee895dc --- /dev/null +++ b/app/config.yaml @@ -0,0 +1,28 @@ +apiVersion: 3 +dataDisplay: + textFormats: + layout-orig-full: + method: layoutRich + style: normal +docs: + docBase: '{docRoot}/{org}/{repo}/blob/master/programs' + docExt: .ipynb + docPage: convert + docRoot: '{urlNb}' + featureBase: '{docBase}' + featurePage: convert +interfaceDefaults: {} +provenanceSpec: + corpus: Two quotes from Consider Phlebas by Iain M. Banks + doi: 10.5281/zenodo.2630416 + version: '0.2' +typeDisplay: + book: + featuresBare: author + line: + features: terminator + label: '{number}' + template: '{number}' + verselike: true + word: + features: gap diff --git a/app/static/logo.png b/app/static/logo.png new file mode 100644 index 0000000..2789817 Binary files /dev/null and b/app/static/logo.png differ diff --git a/tutorial/app.ipynb b/tutorial/app.ipynb new file mode 100644 index 0000000..a75c6d3 --- /dev/null +++ b/tutorial/app.ipynb @@ -0,0 +1,935 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "---\n", + "Start with [convert](https://nbviewer.jupyter.org/github/annotation/banks/blob/master/programs/convert.ipynb)\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The Banks example corpus as app" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from tf.app import use" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We do not only load the main corpus data, but also the additional *sim* (similarity) feature that is in a\n", + "module.\n", + "\n", + "For the very last version, use `hot`.\n", + "\n", + "For the latest release, use `latest`.\n", + "\n", + "If you have cloned the repos (TF app and data), use `clone`.\n", + "\n", + "If you do not want/need to upgrade, leave out the checkout specifiers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "TF-app: ~/github/annotation/app-banks/code" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "data: ~/github/annotation/banks/tf/0.2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "data: ~/github/annotation/banks/sim/tf/0.2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " | 0.01s T sim from ~/github/annotation/banks/sim/tf/0.2\n" + ] + }, + { + "data": { + "text/html": [ + "Text-Fabric: Text-Fabric API 8.5.13, app-banks v3, Search Reference
Data: BANKS, Character table, Feature docs
Features:
annotation/banks/sim/tf
sim
Two quotes from Consider Phlebas by Iain M. Banks
author
gap
letters
number
otype
punc
terminator
title
oslots
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Text-Fabric API: names N F E L T S C TF directly usable

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "A = use(\n", + " \"banks:clone\",\n", + " checkout=\"clone\",\n", + " mod=\"annotation/banks/sim/tf:clone\",\n", + " hoist=globals(),\n", + ")\n", + "# A = use('banks:hot', checkout=\"hot\", mod='annotation/banks/sim/tf:hot', hoist=globals())\n", + "# A = use('banks:latest', checkout=\"latest\", mod='annotation/banks/sim/tf:latest', hoist=globals())\n", + "# A = use('banks', mod='annotation/banks/sim/tf', hoist=globals())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Use the similarity edge feature\n", + "\n", + "We print all similar pairs of words that are at least 50% similar but not 100%." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"\n", + "word\n", + "50> word\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0.01s 170 results\n" + ] + } + ], + "source": [ + "results = A.search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
n | word | word
1 | Consider Phlebas 1:1 Everything | Consider Phlebas 1:1 everything
2 | Consider Phlebas 1:1 Everything | Consider Phlebas 1:1 everything
3 | Consider Phlebas 1:1 us, | Consider Phlebas 1:1 us,
4 | Consider Phlebas 1:1 everything | Consider Phlebas 1:1 Everything
5 | Consider Phlebas 1:1 everything | Consider Phlebas 1:1 everything
6 | Consider Phlebas 1:1 us, | Consider Phlebas 1:1 us,
7 | Consider Phlebas 1:1 everything | Consider Phlebas 1:1 Everything
8 | Consider Phlebas 1:1 everything | Consider Phlebas 1:1 everything
9 | Consider Phlebas 1:1 we | Consider Phlebas 1:2 we
10 | Consider Phlebas 1:1 we | Consider Phlebas 1:2 we
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "A.table(results, end=10, withPassage=\"1 2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

[A.show(results, end=5) rendering: results 1-5 each display the passage Consider Phlebas 1:1 as one sentence of five lines:
Everything about us, / everything around us, / everything we know and can know of / is composed ultimately of patterns of nothing; / that’s the bottom line, the final truth.]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "A.show(results, end=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We sort each pair.\n", + "We keep track of pairs we have seen in order to prevent printing duplicate pairs." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "know ~ own\n", + "harness ~ patterns\n", + "nothing ~ things\n", + "that ~ that’s\n", + "the ~ those\n", + "bottom ~ most\n", + "life ~ line\n", + "societies ~ those\n", + "not ~ to\n", + "make ~ take\n", + "elegant ~ languages\n", + "mattered ~ terms\n", + "left ~ life\n", + "humans ~ mountains\n", + "care ~ romance\n", + "studying ~ things\n", + "impossible ~ problems\n" + ] + } + ], + "source": [ + "seen = set()\n", + "for (w1, w2) in results:\n", + " if (w2, 100) in E.sim.b(w1):\n", + " continue\n", + " letters1 = F.letters.v(w1)\n", + " letters2 = F.letters.v(w2)\n", + " pair = tuple(sorted((letters1, letters2)))\n", + " if pair in seen:\n", + " continue\n", + " seen.add(pair)\n", + " print(\" ~ \".join(pair))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "All chapters:\n", + "\n", + "* [use](use.ipynb)\n", + "* [share](share.ipynb)\n", + "* *app*\n", + "* [repo](repo.ipynb)\n", + "* [compose](compose.ipynb)\n", + "\n", + "---\n", + "\n", + "CC-BY Dirk Roorda" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,md,py:light" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorial/app.md b/tutorial/app.md new file mode 100644 index 0000000..d7ff085 --- /dev/null +++ b/tutorial/app.md @@ -0,0 +1,111 @@ +--- +jupyter: + jupytext: + formats: ipynb,md,py:light + text_representation: + extension: .md + format_name: markdown + format_version: '1.3' + jupytext_version: 1.13.1 + kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + + + + + +--- +Start with [convert](https://nbviewer.jupyter.org/github/annotation/banks/blob/master/programs/convert.ipynb) + +--- + + +# The Banks example corpus as app + +```python +%load_ext autoreload +%autoreload 2 +``` + +```python +from tf.app import use +``` + +We do not only load the main corpus data, but also the additional *sim* (similarity) feature that is in a +module. + +For the very last version, use `hot`. + +For the latest release, use `latest`. + +If you have cloned the repos (TF app and data), use `clone`. + +If you do not want/need to upgrade, leave out the checkout specifiers. 
+ +```python +A = use( + "banks:clone", + checkout="clone", + mod="annotation/banks/sim/tf:clone", + hoist=globals(), +) +# A = use('banks:hot', checkout="hot", mod='annotation/banks/sim/tf:hot', hoist=globals()) +# A = use('banks:latest', checkout="latest", mod='annotation/banks/sim/tf:latest', hoist=globals()) +# A = use('banks', mod='annotation/banks/sim/tf', hoist=globals()) +``` + +# Use the similarity edge feature + +We print all similar pairs of words that are at least 50% similar but not 100%. + +```python +query = """ +word +50> word +""" +``` + +```python +results = A.search(query) +``` + +```python +A.table(results, end=10, withPassage="1 2") +``` + +```python +A.show(results, end=5) +``` + +We sort each pair. +We keep track of pairs we have seen in order to prevent printing duplicate pairs. + +```python +seen = set() +for (w1, w2) in results: + if (w2, 100) in E.sim.b(w1): + continue + letters1 = F.letters.v(w1) + letters2 = F.letters.v(w2) + pair = tuple(sorted((letters1, letters2))) + if pair in seen: + continue + seen.add(pair) + print(" ~ ".join(pair)) +``` + +--- +All chapters: + +* [use](use.ipynb) +* [share](share.ipynb) +* *app* +* [repo](repo.ipynb) +* [compose](compose.ipynb) + +--- + +CC-BY Dirk Roorda diff --git a/tutorial/app.py b/tutorial/app.py new file mode 100644 index 0000000..e451f40 --- /dev/null +++ b/tutorial/app.py @@ -0,0 +1,94 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,md,py:light +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.13.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# +# +# +# +# --- +# Start with [convert](https://nbviewer.jupyter.org/github/annotation/banks/blob/master/programs/convert.ipynb) +# +# --- + +# # The Banks example corpus as app + +# %load_ext autoreload +# %autoreload 2 + +from tf.app import use + +# We do not only load the main corpus data, but also the additional *sim* (similarity) feature that is in a +# module. +# +# For the very last version, use `hot`. +# +# For the latest release, use `latest`. +# +# If you have cloned the repos (TF app and data), use `clone`. +# +# If you do not want/need to upgrade, leave out the checkout specifiers. + +A = use( + "banks:clone", + checkout="clone", + mod="annotation/banks/sim/tf:clone", + hoist=globals(), +) +# A = use('banks:hot', checkout="hot", mod='annotation/banks/sim/tf:hot', hoist=globals()) +# A = use('banks:latest', checkout="latest", mod='annotation/banks/sim/tf:latest', hoist=globals()) +# A = use('banks', mod='annotation/banks/sim/tf', hoist=globals()) + +# # Use the similarity edge feature +# +# We print all similar pairs of words that are at least 50% similar but not 100%. + +query = """ +word +50> word +""" + +results = A.search(query) + +A.table(results, end=10, withPassage="1 2") + +A.show(results, end=5) + +# We sort each pair. +# We keep track of pairs we have seen in order to prevent printing duplicate pairs. 
+ +seen = set() +for (w1, w2) in results: + if (w2, 100) in E.sim.b(w1): + continue + letters1 = F.letters.v(w1) + letters2 = F.letters.v(w2) + pair = tuple(sorted((letters1, letters2))) + if pair in seen: + continue + seen.add(pair) + print(" ~ ".join(pair)) + +# --- +# All chapters: +# +# * [use](use.ipynb) +# * [share](share.ipynb) +# * *app* +# * [repo](repo.ipynb) +# * [compose](compose.ipynb) +# +# --- +# +# CC-BY Dirk Roorda diff --git a/tutorial/combine/input/banks1/tf/0.2/author.tf b/tutorial/combine/input/banks1/tf/0.2/author.tf new file mode 100644 index 0000000..136fd8f --- /dev/null +++ b/tutorial/combine/input/banks1/tf/0.2/author.tf @@ -0,0 +1,14 @@ +@node +@compiler=Dirk Roorda +@description=the author of a book +@name=Culture quotes from Iain Banks +@purpose=exposition +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +100 Iain M. Banks diff --git a/tutorial/combine/input/banks1/tf/0.2/gap.tf b/tutorial/combine/input/banks1/tf/0.2/gap.tf new file mode 100644 index 0000000..9e8ba86 --- /dev/null +++ b/tutorial/combine/input/banks1/tf/0.2/gap.tf @@ -0,0 +1,20 @@ +@node +@compiler=Dirk Roorda +@description=1 for words that occur between [ ], which are inserted by the editor +@name=Culture quotes from Iain Banks +@purpose=exposition +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=int +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +10 1 +1 +1 +1 +78 1 +1 +1 diff --git a/tutorial/combine/input/banks1/tf/0.2/letters.tf b/tutorial/combine/input/banks1/tf/0.2/letters.tf new file mode 100644 index 0000000..3c8517a --- /dev/null +++ b/tutorial/combine/input/banks1/tf/0.2/letters.tf @@ -0,0 +1,112 @@ +@node +@compiler=Dirk Roorda +@description=the letters of a word +@name=Culture quotes from Iain Banks +@purpose=exposition +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +Everything +about +us +everything +around +us +everything +we +know +and +can +know +of +is +composed +ultimately +of +patterns +of +nothing +that’s +the +bottom +line +the +final +truth +So +where +we +find +we +have +any +control +over +those +patterns +why +not +make +the +most +elegant +ones +the +most +enjoyable +and +good +ones +in +our +own +terms +Besides +it +left +the +humans +in +the +Culture +free +to +take +care +of +the +things +that +really +mattered +in +life +such +as +sports +games +romance +studying +dead +languages +barbarian +societies +and +impossible +problems +and +climbing +high +mountains +without +the +aid +of +a +safety +harness diff --git a/tutorial/combine/input/banks1/tf/0.2/number.tf b/tutorial/combine/input/banks1/tf/0.2/number.tf new file mode 100644 index 0000000..2ba656a --- /dev/null +++ b/tutorial/combine/input/banks1/tf/0.2/number.tf @@ -0,0 +1,30 @@ +@node +@compiler=Dirk Roorda +@description=number of chapter, or sentence in chapter, or line in sentence +@name=Culture quotes from Iain Banks +@purpose=exposition +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=int +@version=0.2 
+@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +101 1 +2 +1 +2 +3 +4 +6 +7 +8 +1 +2 +3 +4 +5 +1 +2 +1 diff --git a/tutorial/combine/input/banks1/tf/0.2/oslots.tf b/tutorial/combine/input/banks1/tf/0.2/oslots.tf new file mode 100644 index 0000000..b8c9757 --- /dev/null +++ b/tutorial/combine/input/banks1/tf/0.2/oslots.tf @@ -0,0 +1,30 @@ +@edge +@compiler=Dirk Roorda +@name=Culture quotes from Iain Banks +@purpose=exposition +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +100 1-99 +1-55 +56-99 +1-3 +4-6 +7-9,14-20 +21-27 +28-38 +39-51 +52-55 +56 +57-75 +76-77,81-83 +84-88 +89-99 +1-27 +28-55 +56-99 diff --git a/tutorial/combine/input/banks1/tf/0.2/otext.tf b/tutorial/combine/input/banks1/tf/0.2/otext.tf new file mode 100644 index 0000000..4fac2eb --- /dev/null +++ b/tutorial/combine/input/banks1/tf/0.2/otext.tf @@ -0,0 +1,18 @@ +@config +@compiler=Dirk Roorda +@fmt:line-default={letters:XXX}{terminator} +@fmt:line-term=line#{terminator} +@fmt:text-orig-full={letters}{punc} +@name=Culture quotes from Iain Banks +@purpose=exposition +@sectionFeatures=title,number,number +@sectionTypes=book,chapter,sentence +@source=Good Reads +@status=with for similarities in a separate module +@structureFeatures=title,number,number,number +@structureTypes=book,chapter,sentence,line +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + diff --git a/tutorial/combine/input/banks1/tf/0.2/otype.tf b/tutorial/combine/input/banks1/tf/0.2/otype.tf new file mode 100644 index 0000000..377448a --- /dev/null +++ b/tutorial/combine/input/banks1/tf/0.2/otype.tf @@ -0,0 +1,17 @@ +@node +@compiler=Dirk Roorda +@name=Culture quotes from Iain Banks +@purpose=exposition +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +1-99 word +100 book +101-102 chapter +103-114 line +115-117 sentence diff --git a/tutorial/combine/input/banks1/tf/0.2/punc.tf b/tutorial/combine/input/banks1/tf/0.2/punc.tf new file mode 100644 index 0000000..f2ffc82 --- /dev/null +++ b/tutorial/combine/input/banks1/tf/0.2/punc.tf @@ -0,0 +1,31 @@ +@node +@compiler=Dirk Roorda +@description=the punctuation after a word +@name=Culture quotes from Iain Banks +@purpose=exposition +@remark=a bit more info is needed +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +3 , +6 , +20 ; +24 , +27 . +38 , +45 , +51 , +55 ? +, +75 , +78 , +, +, +83 , +88 , +99 . 
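The feature files above are plain-text TF data: a header of `@key=value` metadata lines, a blank line, and then one value per line. A leading number (as in `3 ,` in `punc.tf` or `10 1` in `gap.tf`) anchors the value to that node; a line without a number applies to the node following the previous one. The sketch below only illustrates that convention; it is a deliberate simplification (single-node specs only, tab-separated, no edge features such as `oslots.tf`, no value escaping), not the real Text-Fabric reader.

```python
# Rough sketch of reading a TF *node* feature file such as punc.tf or gap.tf.
# Assumptions: node spec and value are tab-separated, specs are single node
# numbers, values are not escaped - the real TF format is richer than this.
def read_node_feature(path):
    values = {}
    node = 0
    in_data = False
    with open(path, encoding="utf8") as fh:
        for raw in fh:
            line = raw.rstrip("\n")
            if not in_data:
                if line == "":       # the blank line ends the @-metadata header
                    in_data = True
                continue
            if "\t" in line:         # explicit node number before the value
                spec, value = line.split("\t", 1)
                node = int(spec)
            else:                    # no node number: value is for the next node
                node += 1
                value = line
            values[node] = value
    return values

# Under these assumptions, gap.tf should yield
# {10: '1', 11: '1', 12: '1', 13: '1', 78: '1', 79: '1', 80: '1'}.
```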
diff --git a/tutorial/combine/input/banks1/tf/0.2/terminator.tf b/tutorial/combine/input/banks1/tf/0.2/terminator.tf new file mode 100644 index 0000000..c59b175 --- /dev/null +++ b/tutorial/combine/input/banks1/tf/0.2/terminator.tf @@ -0,0 +1,25 @@ +@node +@compiler=Dirk Roorda +@description=the last character of a line +@name=Culture quotes from Iain Banks +@purpose=exposition +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +103 , +, +; +. +, +, +? +, +, +, +, +. diff --git a/tutorial/combine/input/banks1/tf/0.2/title.tf b/tutorial/combine/input/banks1/tf/0.2/title.tf new file mode 100644 index 0000000..286048d --- /dev/null +++ b/tutorial/combine/input/banks1/tf/0.2/title.tf @@ -0,0 +1,14 @@ +@node +@compiler=Dirk Roorda +@description=the title of a book +@name=Culture quotes from Iain Banks +@purpose=exposition +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +100 Consider Phlebas diff --git a/tutorial/combine/input/banks2/tf/0.2/author.tf b/tutorial/combine/input/banks2/tf/0.2/author.tf new file mode 100644 index 0000000..d866afb --- /dev/null +++ b/tutorial/combine/input/banks2/tf/0.2/author.tf @@ -0,0 +1,14 @@ +@node +@compiler=Berend Roorda +@description=the author of a book +@name=Culture quotes from Iain Banks +@purpose=testing +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@version=0.3 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +100 Iain M. 
Banks diff --git a/tutorial/combine/input/banks2/tf/0.2/gap.tf b/tutorial/combine/input/banks2/tf/0.2/gap.tf new file mode 100644 index 0000000..9e8ba86 --- /dev/null +++ b/tutorial/combine/input/banks2/tf/0.2/gap.tf @@ -0,0 +1,20 @@ +@node +@compiler=Dirk Roorda +@description=1 for words that occur between [ ], which are inserted by the editor +@name=Culture quotes from Iain Banks +@purpose=exposition +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=int +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +10 1 +1 +1 +1 +78 1 +1 +1 diff --git a/tutorial/combine/input/banks2/tf/0.2/letters.tf b/tutorial/combine/input/banks2/tf/0.2/letters.tf new file mode 100644 index 0000000..d05f7fc --- /dev/null +++ b/tutorial/combine/input/banks2/tf/0.2/letters.tf @@ -0,0 +1,112 @@ +@node +@compiler=Dirk Roorda +@description=the signs of a word +@name=Culture quotes from Iain Banks +@purpose=exposition +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +Everything +about +us +everything +around +us +everything +we +know +and +can +know +of +is +composed +ultimately +of +patterns +of +nothing +that’s +the +bottom +line +the +final +truth +So +where +we +find +we +have +any +control +over +those +patterns +why +not +make +the +most +elegant +ones +the +most +enjoyable +and +good +ones +in +our +own +terms +Besides +it +left +the +humans +in +the +Culture +free +to +take +care +of +the +things +that +really +mattered +in +life +such +as +sports +games +romance +studying +dead +languages +barbarian +societies +and +impossible +problems +and +climbing +high +mountains +without +the +aid +of +a +safety +harness diff --git a/tutorial/combine/input/banks2/tf/0.2/number.tf b/tutorial/combine/input/banks2/tf/0.2/number.tf new file mode 100644 index 0000000..2ba656a --- /dev/null +++ b/tutorial/combine/input/banks2/tf/0.2/number.tf @@ -0,0 +1,30 @@ +@node +@compiler=Dirk Roorda +@description=number of chapter, or sentence in chapter, or line in sentence +@name=Culture quotes from Iain Banks +@purpose=exposition +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=int +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +101 1 +2 +1 +2 +3 +4 +6 +7 +8 +1 +2 +3 +4 +5 +1 +2 +1 diff --git a/tutorial/combine/input/banks2/tf/0.2/oslots.tf b/tutorial/combine/input/banks2/tf/0.2/oslots.tf new file mode 100644 index 0000000..b8c9757 --- /dev/null +++ b/tutorial/combine/input/banks2/tf/0.2/oslots.tf @@ -0,0 +1,30 @@ +@edge +@compiler=Dirk Roorda +@name=Culture quotes from Iain Banks +@purpose=exposition +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +100 1-99 +1-55 +56-99 +1-3 +4-6 +7-9,14-20 +21-27 +28-38 +39-51 +52-55 +56 +57-75 +76-77,81-83 +84-88 +89-99 +1-27 +28-55 +56-99 diff --git a/tutorial/combine/input/banks2/tf/0.2/otext.tf b/tutorial/combine/input/banks2/tf/0.2/otext.tf new file mode 100644 index 0000000..af6b405 --- /dev/null +++ b/tutorial/combine/input/banks2/tf/0.2/otext.tf @@ -0,0 +1,19 @@ +@config 
+@compiler=Dirk Roorda +@fmt:line-default={letters:XXX}{terminator} +@fmt:line-term=line#{terminator} +@fmt:text-orig-full={letters}{gap} +@fmt:text-orig-extra={letters}{punc}{gap} +@name=Culture quotes from Iain Banks +@purpose=exposition +@sectionFeatures=number,number,number +@sectionTypes=chapter,sentence,line +@source=Good Reads +@status=with for similarities in a separate module +@structureFeatures=title,number,number +@structureTypes=book,chapter,sentence +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + diff --git a/tutorial/combine/input/banks2/tf/0.2/otype.tf b/tutorial/combine/input/banks2/tf/0.2/otype.tf new file mode 100644 index 0000000..377448a --- /dev/null +++ b/tutorial/combine/input/banks2/tf/0.2/otype.tf @@ -0,0 +1,17 @@ +@node +@compiler=Dirk Roorda +@name=Culture quotes from Iain Banks +@purpose=exposition +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +1-99 word +100 book +101-102 chapter +103-114 line +115-117 sentence diff --git a/tutorial/combine/input/banks2/tf/0.2/punc.tf b/tutorial/combine/input/banks2/tf/0.2/punc.tf new file mode 100644 index 0000000..f2ffc82 --- /dev/null +++ b/tutorial/combine/input/banks2/tf/0.2/punc.tf @@ -0,0 +1,31 @@ +@node +@compiler=Dirk Roorda +@description=the punctuation after a word +@name=Culture quotes from Iain Banks +@purpose=exposition +@remark=a bit more info is needed +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +3 , +6 , +20 ; +24 , +27 . +38 , +45 , +51 , +55 ? +, +75 , +78 , +, +, +83 , +88 , +99 . diff --git a/tutorial/combine/input/banks2/tf/0.2/terminator.tf b/tutorial/combine/input/banks2/tf/0.2/terminator.tf new file mode 100644 index 0000000..c59b175 --- /dev/null +++ b/tutorial/combine/input/banks2/tf/0.2/terminator.tf @@ -0,0 +1,25 @@ +@node +@compiler=Dirk Roorda +@description=the last character of a line +@name=Culture quotes from Iain Banks +@purpose=exposition +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +103 , +, +; +. +, +, +? +, +, +, +, +. 
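Note that the `banks2` copy deliberately differs from `banks1` in a few places: `author.tf` has a different `@compiler` and `@purpose`, `letters.tf` has a different `@description`, and `otext.tf` declares different `sectionTypes`/`structureTypes`. These are exactly the differences that produce the "metadata varies across volumes" warnings when the two copies are collected into one dataset further down in this tutorial. The `@fmt:` lines in `otext.tf` are templates that are filled with feature values per word (slot) and concatenated into text; below is a minimal sketch of that idea, using hypothetical stand-in dictionaries instead of the real `F.letters`/`F.punc` features.

```python
# Minimal sketch (not the actual Text-Fabric implementation) of expanding a
# text format template such as "{letters}{punc} ": for each slot node the
# named features are substituted and the results are concatenated.
# The dictionaries below are hypothetical stand-ins for real feature data.
letters = {1: "Everything", 2: "about", 3: "us", 4: "everything"}
punc = {3: ","}

def render(slots, template="{letters}{punc} "):
    # assumption: a feature without a value for a node renders as ""
    return "".join(
        template.format(letters=letters.get(n, ""), punc=punc.get(n, ""))
        for n in slots
    )

print(render([1, 2, 3, 4]))  # -> 'Everything about us, everything '
```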
diff --git a/tutorial/combine/input/banks2/tf/0.2/title.tf b/tutorial/combine/input/banks2/tf/0.2/title.tf new file mode 100644 index 0000000..286048d --- /dev/null +++ b/tutorial/combine/input/banks2/tf/0.2/title.tf @@ -0,0 +1,14 @@ +@node +@compiler=Dirk Roorda +@description=the title of a book +@name=Culture quotes from Iain Banks +@purpose=exposition +@source=Good Reads +@status=with for similarities in a separate module +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-05-20T19:12:23Z + +100 Consider Phlebas diff --git a/tutorial/compose.ipynb b/tutorial/compose.ipynb new file mode 100644 index 0000000..6726861 --- /dev/null +++ b/tutorial/compose.ipynb @@ -0,0 +1,2130 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "---\n", + "Start with [convert](https://nbviewer.jupyter.org/github/annotation/banks/blob/master/programs/convert.ipynb)\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Compose\n", + "\n", + "This is about combining multiple TF datasets into one, and then tweaking it further.\n", + "\n", + "In the previous chapters of this tutorial you have learned how to add new features to an existing dataset.\n", + "\n", + "Here you learn how you can combine dozens of slightly heterogeneous TF data sets,\n", + "and apply structural tweaks to the node types and features later on.\n", + "\n", + "The incentive to write these composition functions into Text-Fabric came from Ernst Boogert while he was\n", + "converting between 100 and 200 works by the Church Fathers (Patristics).\n", + "The conversion did a very good job in getting all the information from TEI files with different structures into TF,\n", + "one dataset per work.\n", + "\n", + "Then the challenge became to combine them into one big dataset, and to merge several node types into one type,\n", + "and several features into one.\n", + "\n", + "See [patristics](https://github.com/pthu/patristics)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The new functions are `collect()` and `modify()`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from tf.fabric import Fabric\n", + "from tf.dataset import modify\n", + "from tf.volumes import collect" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Corpus\n", + "\n", + "We use two copies of our example corpus Banks, present in this repository." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Collect\n", + "\n", + "The *collect* function takes any number of directory locations, and considers each location to be the\n", + "host of a TF data set.\n", + "\n", + "You can pass this list straight to the `collect()` function as the `locations` parameter,\n", + "or you can add names to the individual corpora.\n", + "In that case, you pass an iterable of (`name`, `location`) pairs into the `locations` parameter.\n", + "\n", + "Here we give the first copy the name `banks`, and the second copy the name `river`.\n", + "\n", + "We also specify the output location." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "PREFIX = \"combine/input\"\n", + "SUFFIX = \"tf/0.2\"\n", + "\n", + "locations = (\n", + " (\"banks\", f\"{PREFIX}/banks1/{SUFFIX}\"),\n", + " (\"rivers\", f\"{PREFIX}/banks2/{SUFFIX}\"),\n", + ")\n", + "\n", + "COMBINED = \"combine/_temp/riverbanks\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are going to call the `collect()` function.\n", + "\n", + "But first we clear the output location.\n", + "\n", + "Note how you can mix a bash-shell command with your Python code." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0.00s Loading volume banks from combine/input/banks1/tf/0.2 ...\n", + "This is Text-Fabric 9.1.3\n", + "Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html\n", + "\n", + "10 features found and 0 ignored\n", + " 0.00s loading features ...\n", + " 0.01s All features loaded/computed - for details use TF.isLoaded()\n", + " | 0.00s Feature overview: 8 for nodes; 1 for edges; 1 configs; 8 computed\n", + " 0.00s loading features ...\n", + " 0.00s All additional features loaded - for details use TF.isLoaded()\n", + " 0.02s Loading volume rivers from combine/input/banks2/tf/0.2 ...\n", + "This is Text-Fabric 9.1.3\n", + "Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html\n", + "\n", + "10 features found and 0 ignored\n", + " 0.00s loading features ...\n", + " 0.01s All features loaded/computed - for details use TF.isLoaded()\n", + " | 0.00s Feature overview: 8 for nodes; 1 for edges; 1 configs; 8 computed\n", + " 0.00s loading features ...\n", + " 0.00s All additional features loaded - for details use TF.isLoaded()\n", + " 0.04s inspect metadata ...\n", + "WARNING: otext.structureFeatures metadata varies across volumes\n", + "WARNING: otext.structureTypes metadata varies across volumes\n", + "WARNING: author.compiler metadata varies across volumes\n", + "WARNING: author.purpose metadata varies across volumes\n", + "WARNING: letters.description metadata varies across volumes\n", + " 0.04s metadata sorted out\n", + " 0.04s check nodetypes ...\n", + " | volume banks\n", + " | volume rivers\n", + " 0.04s node types ok\n", + " 0.04s Collect nodes from volumes ...\n", + " | 0.00s Check against overlapping slots ...\n", + " | | banks : 99 slots\n", + " | | rivers : 99 slots\n", + " | 0.00s no overlap\n", + " | 0.00s Group non-slot nodes by type\n", + " | | banks : 100- 117\n", + " | | rivers : 100- 117\n", + " | 0.00s Mapping nodes from volume to/from work ...\n", + " | | book : 199 - 200\n", + " | | chapter : 201 - 204\n", + " | | line : 205 - 228\n", + " | | sentence : 229 - 234\n", + " | 0.01s The new work has 236 nodes of which 198 slots\n", + " 0.05s collection done\n", + " 0.05s remap features ...\n", + " 0.05s remapping done\n", + " 0.05s write work as TF data set\n", + " 0.07s writing done\n", + " 0.07s done\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output = COMBINED\n", + "\n", + "!rm -rf {output}\n", + "\n", + "collect(\n", + " locations,\n", + " output,\n", + " volumeType=\"volume\",\n", + " volumeFeature=\"title\",\n", + " featureMeta=dict(\n", + " otext=dict(\n", + " sectionTypes=\"volume,chapter,line\",\n", + " 
sectionFeatures=\"title,number,number\",\n", + " **{\"fmt:text-orig-full\": \"{letters} \"},\n", + " ),\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This function is a bit verbose in its output, but a lot happens under the hood, and if your dataset is large,\n", + "it may take several minutes. It is pleasant to see the progress under those circumstances.\n", + "\n", + "But for now, we pass `silent=True`, to make everything a bit more quiet." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING: otext.structureFeatures metadata varies across volumes\n", + "WARNING: otext.structureTypes metadata varies across volumes\n", + "WARNING: author.compiler metadata varies across volumes\n", + "WARNING: author.purpose metadata varies across volumes\n", + "WARNING: letters.description metadata varies across volumes\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output = COMBINED\n", + "\n", + "!rm -rf {output}\n", + "\n", + "collect(\n", + " locations,\n", + " output,\n", + " volumeType=\"volume\",\n", + " volumeFeature=\"title\",\n", + " featureMeta=dict(\n", + " otext=dict(\n", + " sectionTypes=\"volume,chapter,line\",\n", + " sectionFeatures=\"title,number,number\",\n", + " **{\"fmt:text-orig-full\": \"{letters} \"},\n", + " ),\n", + " ),\n", + " silent=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There you are, on your file system you see the combined dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 88\n", + "-rw-r--r-- 1 dirk staff 559 Nov 4 16:04 author.tf\n", + "-rw-r--r-- 1 dirk staff 524 Nov 4 16:04 gap.tf\n", + "-rw-r--r-- 1 dirk staff 1619 Nov 4 16:04 letters.tf\n", + "-rw-r--r-- 1 dirk staff 548 Nov 4 16:04 number.tf\n", + "-rw-r--r-- 1 dirk staff 681 Nov 4 16:04 oslots.tf\n", + "-rw-r--r-- 1 dirk staff 1062 Nov 4 16:04 otext.tf\n", + "-rw-r--r-- 1 dirk staff 485 Nov 4 16:04 otype.tf\n", + "-rw-r--r-- 1 dirk staff 2747 Nov 4 16:04 ovolume.tf\n", + "-rw-r--r-- 1 dirk staff 640 Nov 4 16:04 punc.tf\n", + "-rw-r--r-- 1 dirk staff 494 Nov 4 16:04 terminator.tf\n", + "-rw-r--r-- 1 dirk staff 563 Nov 4 16:04 title.tf\n" + ] + } + ], + "source": [ + "!ls -l {output}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we compare that with one of the input:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 80\n", + "-rw-r--r-- 1 dirk staff 359 May 20 2019 author.tf\n", + "-rw-r--r-- 1 dirk staff 409 May 20 2019 gap.tf\n", + "-rw-r--r-- 1 dirk staff 911 May 20 2019 letters.tf\n", + "-rw-r--r-- 1 dirk staff 421 May 20 2019 number.tf\n", + "-rw-r--r-- 1 dirk staff 419 May 20 2019 oslots.tf\n", + "-rw-r--r-- 1 dirk staff 572 May 20 2019 otext.tf\n", + "-rw-r--r-- 1 dirk staff 372 May 30 2019 otype.tf\n", + "-rw-r--r-- 1 dirk staff 457 May 20 2019 punc.tf\n", + "-rw-r--r-- 1 dirk staff 377 May 20 2019 terminator.tf\n", + "-rw-r--r-- 1 dirk staff 361 May 20 2019 title.tf\n" + ] + } + ], + "source": [ + "!ls -l {PREFIX}/banks1/{SUFFIX}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "then we 
see the same files (with the addition of *ovolume.tf*\n", + "but smaller file sizes.\n", + "\n", + "## Result\n", + "\n", + "Let's have a look inside, and note that we use the TF function `loadAll()`\n", + "which loads all loadable features." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is Text-Fabric 9.1.3\n", + "Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html\n", + "\n", + "11 features found and 0 ignored\n", + " 0.00s loading features ...\n", + " | 0.00s T otype from combine/_temp/riverbanks\n", + " | 0.00s T oslots from combine/_temp/riverbanks\n", + " | 0.00s Dataset without structure sections in otext:no structure functions in the T-API\n", + " | 0.00s T number from combine/_temp/riverbanks\n", + " | 0.00s T punc from combine/_temp/riverbanks\n", + " | 0.00s T gap from combine/_temp/riverbanks\n", + " | 0.00s T terminator from combine/_temp/riverbanks\n", + " | 0.00s T title from combine/_temp/riverbanks\n", + " | 0.00s T letters from combine/_temp/riverbanks\n", + " | | 0.00s C __levels__ from otype, oslots, otext\n", + " | | 0.00s C __order__ from otype, oslots, __levels__\n", + " | | 0.00s C __rank__ from otype, __order__\n", + " | | 0.00s C __levUp__ from otype, oslots, __rank__\n", + " | | 0.00s C __levDown__ from otype, __levUp__, __rank__\n", + " | | 0.00s C __boundary__ from otype, oslots, __rank__\n", + " | | 0.00s C __sections__ from otype, oslots, otext, __levUp__, __levels__, title, number, number\n", + " 0.03s All features loaded/computed - for details use TF.isLoaded()\n", + " | 0.00s Feature overview: 9 for nodes; 1 for edges; 1 configs; 8 computed\n", + " 0.00s loading features ...\n", + " | 0.00s T author from combine/_temp/riverbanks\n", + " | 0.00s T ovolume from combine/_temp/riverbanks\n", + " 0.01s All additional features loaded - for details use TF.isLoaded()\n" + ] + } + ], + "source": [ + "TF = Fabric(locations=COMBINED)\n", + "api = TF.loadAll(silent=False)\n", + "docs = api.makeAvailableIn(globals())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We look up the section of the first word:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('banks', 1, 1)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "T.sectionFromNode(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The component sets had 99 words each. So what is the section of word 100?" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('rivers', 1, 1)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "T.sectionFromNode(100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Right, that's the first word of the second component.\n", + "\n", + "Here is an overview of all the node types in the combined set.\n", + "\n", + "The second field is the average length in words for nodes of that type, the remaining fields give\n", + "the first and last node of that type." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(('book', 99.0, 199, 200),\n", + " ('volume', 99.0, 235, 236),\n", + " ('chapter', 49.5, 201, 204),\n", + " ('sentence', 33.0, 229, 234),\n", + " ('line', 7.666666666666667, 205, 228),\n", + " ('word', 1, 1, 198))" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "C.levels.data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The combined data set consists of the concatenation of all slot nodes of the component data sets.\n", + "\n", + "Note that the individual components have got a top node, of type `volume`.\n", + "This is the effect of specifying `componentType='volume'`.\n", + "\n", + "There is also a feature for volumes, named `title`, that contains their name, or if we haven't passed their names\n", + "in the `locations` parameter, their location.\n", + "This is the effect of `componentFeature='title'`.\n", + "\n", + "Let's check.\n", + "\n", + "We use the new `.items()` method on features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_items([(199, 'Consider Phlebas'), (200, 'Consider Phlebas'), (235, 'banks'), (236, 'rivers')])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "F.title.items()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see several things:\n", + "\n", + "* the volume nodes indeed got the component name in the feature `title`\n", + "* the other nodes that already had a title, the `book` nodes, still have the same value for `title` as before.\n", + "\n", + "### The merging principle\n", + "\n", + "This is a general principle that we see over and over again: when we combine data, we merge as much as possible.\n", + "\n", + "That means that when you create new features, you may use the names of old features, and the new information for that\n", + "feature will be merged with the old information of that feature." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modify\n", + "\n", + "Although combining has its complications, the most complex operation is `modify()` because it can do many things.\n", + "\n", + "It operates on a single TF dataset, and it produces a modified dataset as a fresh \"copy\".\n", + "\n", + "Despite the name, no actual modification takes place on the input dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "location = f\"{PREFIX}/banks1/{SUFFIX}\"\n", + "\n", + "MODIFIED = \"_temp/mudbanks\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we take the first local copy of the Banks dataset as our input, for a lot of different operations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is the list what `modify()` can do.\n", + "The order is important, because all operations are executed in this order:\n", + "\n", + "1. **merge features**: several input features are combined into a single output feature and then deleted;\n", + "2. **delete features**: several features are be deleted\n", + "3. **add features**: several node/edge features with their data are added to the dataset\n", + "4. 
**merge types**: several input node types are combined into a single output node type;\n", + " the input nodetypes are deleted, but not their nodes: they are now part of the output node type;\n", + "5. **delete types**: several node types are deleted, *with their nodes*, and all features\n", + " will be remapped to accomodate for this;\n", + "6. **add types**: several new node types with additional feature data for them are added after the last node;\n", + " features do not have to be remapped for this; the new node types may be arbitrary intervals of integers and\n", + " have no relationship with the existing nodes.\n", + "7. **modify metadata**: the metadata of all features can be tweaked, including everything that is in the\n", + " `otext` feature, such as text formats and section structure definitions.\n", + "\n", + "Modify will perform as many sanity checks as possible before it starts working, so that the chances are good that\n", + "the modified dataset will load properly.\n", + "It will adapt the value type of features to the values encountered, and it will deduce whether edges have values or not.\n", + "\n", + "If a modified dataset does not load, while the original dataset did load, it is a bug, and I welcome a\n", + "[GitHub issue](https://github.com/annotation/text-fabric/issues)\n", + "for it." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Only meta data\n", + "\n", + "We start with the last one, the most simple one." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "otext = dict(\n", + " sectionTypes=\"book,chapter\",\n", + " sectionFeatures=\"title,number\",\n", + " **{\"fmt:text-orig-full\": \"{letters} \"},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use `silent=True` from now on, but if you work with larger datasets, it is recommended to set `silent=False` or\n", + "to leave it out altogether." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test = \"meta\"\n", + "output = f\"{MODIFIED}.{test}\"\n", + "\n", + "!rm -rf {output}\n", + "\n", + "modify(\n", + " location,\n", + " output,\n", + " featureMeta=dict(otext=otext),\n", + " silent=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Result" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "TF = Fabric(locations=f\"{MODIFIED}.{test}\", silent=True)\n", + "api = TF.loadAll(silent=True)\n", + "docs = api.makeAvailableIn(globals())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have now only 2 section levels. If we ask for some sections, we see that we only get 2 components in the tuple." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('Consider Phlebas', 1)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "T.sectionFromNode(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('Consider Phlebas', 2)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "T.sectionFromNode(99)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Merge features\n", + "\n", + "We are going to do some tricky mergers on features that are involved in the section structure and the\n", + "text formats, so we take care to modify those by means of the `featureMeta` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "otext = dict(\n", + " sectionTypes=\"book,chapter\",\n", + " sectionFeatures=\"heading,heading\",\n", + " structureTypes=\"book,chapter\",\n", + " structureFeatures=\"heading,heading\",\n", + " **{\n", + " \"fmt:text-orig-full\": \"{content} \",\n", + " \"fmt:text-orig-fake\": \"{fake} \",\n", + " \"fmt:line-default\": \"{content:XXX}{terminator} \",\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want sectional headings in one feature, `heading`, instead of in `title` for books and `number` for chapters.\n", + "\n", + "We also make a `content` feature that gives the `letters` of a word unless there is punctuation: then it gives `punc`.\n", + "\n", + "And we make the opposite: `fake`: it prefers `punc` over `letters`.\n", + "\n", + "Note that `punc` and `letters` will be deleted after the merge as a whole is completed, so that it is indeed\n", + "possible for features to be the input of multiple mergers." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test = \"merge.f\"\n", + "output = f\"{MODIFIED}.{test}\"\n", + "\n", + "!rm -rf {output}\n", + "\n", + "modify(\n", + " location,\n", + " output,\n", + " mergeFeatures=dict(\n", + " heading=(\"title number\"), content=(\"punc letters\"), fake=(\"letters punc\")\n", + " ),\n", + " featureMeta=dict(\n", + " otext=otext,\n", + " ),\n", + " silent=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Result" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "TF = Fabric(locations=f\"{MODIFIED}.{test}\", silent=True)\n", + "api = TF.loadAll(silent=True)\n", + "docs = api.makeAvailableIn(globals())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We inspect the new `heading` feature for a book and a chapter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Consider Phlebas'" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "b = F.otype.s(\"book\")[0]\n", + "F.heading.v(b)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1'" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c = F.otype.s(\"chapter\")[0]\n", + "F.heading.v(c)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And here is an overview of all node features: `title` and `number` are gone, together with `punc` and `letters`." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['author', 'content', 'fake', 'gap', 'heading', 'otype', 'terminator']" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Fall()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have modified the standard text format, `text-orig-full`. It now uses the `content` feature,\n", + "and indeed, we do not see punctuation anymore." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Everything about us everything around us everything we know '" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "T.text(range(1, 10))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "On the other hand, `text-orig-fake` uses the `fake` feature, and we see that the words in front\n", + "of punctuation have disappeared." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Everything about , everything around , everything we know '" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "T.text(range(1, 10), fmt=\"text-orig-fake\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete features\n", + "\n", + "We just remove two features from the dataset: `author` and `terminator`." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " | Missing for text API: features: terminator\n" + ] + }, + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test = \"delete.f\"\n", + "output = f\"{MODIFIED}.{test}\"\n", + "\n", + "!rm -rf {output}\n", + "\n", + "modify(\n", + " location,\n", + " output,\n", + " deleteFeatures=\"author terminator\",\n", + " silent=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Oops. `terminator` is used in a text-format, so if we delete it, the dataset will not load properly.\n", + "\n", + "Let's not delete `terminator` but `gap`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test = \"delete.f\"\n", + "output = f\"{MODIFIED}.{test}\"\n", + "\n", + "modify(\n", + " location,\n", + " output,\n", + " deleteFeatures=\"author gap\",\n", + " silent=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Result" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "TF = Fabric(locations=f\"{MODIFIED}.{test}\", silent=True)\n", + "api = TF.loadAll(silent=True)\n", + "docs = api.makeAvailableIn(globals())" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['letters', 'number', 'otype', 'punc', 'terminator', 'title']" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Fall()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Indeed, `gap` is gone." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'NodeFeatures' object has no attribute 'gap'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mF\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgap\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfreqList\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m: 'NodeFeatures' object has no attribute 'gap'" + ] + } + ], + "source": [ + "F.gap.freqList()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I told you! Sigh ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add features\n", + "\n", + "We add a bunch of node features and edge features.\n", + "\n", + "When you add features, you also have to pass their data.\n", + "Here we compute that data in place, which results in a lengthy call, but usually you'll get\n", + "that data from somewhere in a dictionary, and you only pass the dictionary." 
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We do not have to explicitly tell the value types of the new features: `modify()` will deduce them.\n",
+ "We can override that by passing a value type explicitly.\n",
+ "\n",
+ "Let's declare `lemma` to be `str`, and `big` to be `int`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " | Add features: big: feature values are declared to be int but some values are not int\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "False"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test = \"add.f\"\n",
+ "output = f\"{MODIFIED}.{test}\"\n",
+ "\n",
+ "!rm -rf {output}\n",
+ "\n",
+ "modify(\n",
+ "    location,\n",
+ "    output,\n",
+ "    addFeatures=dict(\n",
+ "        nodeFeatures=dict(\n",
+ "            author={101: \"Banks Jr.\", 102: \"Banks Sr.\"},\n",
+ "            lemma={n: 1000 + n for n in range(1, 10)},\n",
+ "            small={n: chr(ord(\"a\") + n % 26) for n in range(1, 10)},\n",
+ "            big={n: chr(ord(\"A\") + n % 26) for n in range(1, 10)},\n",
+ "        ),\n",
+ "        edgeFeatures=dict(\n",
+ "            link={n: {n + i for i in range(1, 3)} for n in range(1, 10)},\n",
+ "            similarity={\n",
+ "                n: {n + i: chr(ord(\"a\") + (i + n) % 26) for i in range(1, 3)}\n",
+ "                for n in range(1, 10)\n",
+ "            },\n",
+ "        ),\n",
+ "    ),\n",
+ "    featureMeta=dict(\n",
+ "        lemma=dict(\n",
+ "            valueType=\"str\",\n",
+ "        ),\n",
+ "        big=dict(\n",
+ "            valueType=\"int\",\n",
+ "        ),\n",
+ "    ),\n",
+ "    silent=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We get away with `lemma` as string, because everything that is written is also a string.\n",
+ "But not all values of `big` are numbers, so we get a complaint.\n",
+ "\n",
+ "Let's stick to the default:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test = \"add.f\"\n",
+ "output = f\"{MODIFIED}.{test}\"\n",
+ "\n",
+ "!rm -rf {output}\n",
+ "\n",
+ "modify(\n",
+ "    location,\n",
+ "    output,\n",
+ "    addFeatures=dict(\n",
+ "        nodeFeatures=dict(\n",
+ "            author={101: \"Banks Jr.\", 102: \"Banks Sr.\"},\n",
+ "            lemma={n: 1000 + n for n in range(1, 10)},\n",
+ "            small={n: chr(ord(\"a\") + n % 26) for n in range(1, 10)},\n",
+ "            big={n: chr(ord(\"A\") + n % 26) for n in range(1, 10)},\n",
+ "        ),\n",
+ "        edgeFeatures=dict(\n",
+ "            link={n: {n + i for i in range(1, 3)} for n in range(1, 10)},\n",
+ "            similarity={\n",
+ "                n: {n + i: chr(ord(\"a\") + (i + n) % 26) for i in range(1, 3)}\n",
+ "                for n in range(1, 10)\n",
+ "            },\n",
+ "        ),\n",
+ "    ),\n",
+ "    silent=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "TF = Fabric(locations=f\"{MODIFIED}.{test}\", silent=True)\n",
+ "api = TF.loadAll(silent=True)\n",
+ "docs = api.makeAvailableIn(globals())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['author',\n",
+ " 'big',\n",
+ " 'gap',\n",
+ " 'lemma',\n",
+ " 'letters',\n",
+ " 'number',\n",
+ " 'otype',\n",
+ " 'punc',\n",
+ " 'small',\n",
+ " 'terminator',\n",
+ " 'title']"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Fall()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['link', 'oslots', 'similarity']" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Eall()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see the extra features, and let's just enumerate their mappings.\n", + "\n", + "`link` is an edge feature where edges do not have values.\n", + "So for each `n`, the result is a set of nodes." + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_items([(1, frozenset({2, 3})), (2, frozenset({3, 4})), (3, frozenset({4, 5})), (4, frozenset({5, 6})), (5, frozenset({6, 7})), (6, frozenset({8, 7})), (7, frozenset({8, 9})), (8, frozenset({9, 10})), (9, frozenset({10, 11}))])" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "E.link.items()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`similarity` assigns values to the edges. So for each `n`, the result is a mapping from nodes to values." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_items([(1, {2: 'c', 3: 'd'}), (2, {3: 'd', 4: 'e'}), (3, {4: 'e', 5: 'f'}), (4, {5: 'f', 6: 'g'}), (5, {6: 'g', 7: 'h'}), (6, {7: 'h', 8: 'i'}), (7, {8: 'i', 9: 'j'}), (8, {9: 'j', 10: 'k'}), (9, {10: 'k', 11: 'l'})])" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "E.similarity.items()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((2, 'c'), (3, 'd'))" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "E.similarity.f(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now the node features." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_items([(100, 'Iain M. 
Banks'), (101, 'Banks Jr.'), (102, 'Banks Sr.')])" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "F.author.items()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_items([(1, 'b'), (2, 'c'), (3, 'd'), (4, 'e'), (5, 'f'), (6, 'g'), (7, 'h'), (8, 'i'), (9, 'j')])" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "F.small.items()" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_items([(1, 'B'), (2, 'C'), (3, 'D'), (4, 'E'), (5, 'F'), (6, 'G'), (7, 'H'), (8, 'I'), (9, 'J')])" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "F.big.items()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_items([(1, 1001), (2, 1002), (3, 1003), (4, 1004), (5, 1005), (6, 1006), (7, 1007), (8, 1008), (9, 1009)])" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "F.lemma.items()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Merge types\n", + "\n", + "Manipulating features is relatively easy. But when we fiddle with the node types, we need our wits about us.\n", + "\n", + "In this example, we first do a feature merge of `title` and `number` into `nm`.\n", + "\n", + "Then we merge the `line` and `sentence` types into a new type `rule`.\n", + "\n", + "And `book` and `chapter` will merge into `section`.\n", + "\n", + "We adapt our section structure so that it makes use of the new features and types." 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test = \"merge.t\"\n",
+ "output = f\"{MODIFIED}.{test}\"\n",
+ "\n",
+ "!rm -rf {output}\n",
+ "\n",
+ "modify(\n",
+ "    location,\n",
+ "    output,\n",
+ "    mergeFeatures=dict(nm=\"title number\"),\n",
+ "    mergeTypes=dict(\n",
+ "        rule=dict(\n",
+ "            line=dict(\n",
+ "                type=\"line\",\n",
+ "            ),\n",
+ "            sentence=dict(\n",
+ "                type=\"sentence\",\n",
+ "            ),\n",
+ "        ),\n",
+ "        section=dict(\n",
+ "            book=dict(\n",
+ "                type=\"book\",\n",
+ "            ),\n",
+ "            chapter=dict(\n",
+ "                type=\"chapter\",\n",
+ "            ),\n",
+ "        ),\n",
+ "    ),\n",
+ "    featureMeta=dict(\n",
+ "        otext=dict(\n",
+ "            sectionTypes=\"section,rule\",\n",
+ "            sectionFeatures=\"nm,nm\",\n",
+ "            structureTypes=\"section\",\n",
+ "            structureFeatures=\"nm\",\n",
+ "        ),\n",
+ "    ),\n",
+ "    silent=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "TF = Fabric(locations=f\"{MODIFIED}.{test}\", silent=True)\n",
+ "api = TF.loadAll(silent=True)\n",
+ "docs = api.makeAvailableIn(globals())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We expect a severely reduced inventory of node types:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(('section', 66.0, 100, 102),\n",
+ " ('rule', 12.733333333333333, 103, 117),\n",
+ " ('word', 1, 1, 99))"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "C.levels.data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['author', 'gap', 'letters', 'nm', 'otype', 'punc', 'terminator', 'type']"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Fall()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Delete types\n",
+ "\n",
+ "We delete the `line` and `sentence` types."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " | Missing for text API: types: line, sentence\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "False"
+ ]
+ },
+ "execution_count": 54,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test = \"delete.t\"\n",
+ "output = f\"{MODIFIED}.{test}\"\n",
+ "\n",
+ "!rm -rf {output}\n",
+ "\n",
+ "modify(\n",
+ "    location,\n",
+ "    output,\n",
+ "    deleteTypes=\"sentence line\",\n",
+ "    silent=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "But, again, we can't do that because they are important for the text API.\n",
+ "\n",
+ "This time, we change the text API, so that it does not need them anymore."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 55,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test = \"delete.t\"\n",
+ "output = f\"{MODIFIED}.{test}\"\n",
+ "\n",
+ "modify(\n",
+ "    location,\n",
+ "    output,\n",
+ "    deleteTypes=\"sentence line\",\n",
+ "    featureMeta=dict(\n",
+ "        otext=dict(\n",
+ "            sectionTypes=\"book,chapter\",\n",
+ "            sectionFeatures=\"title,number\",\n",
+ "            structureTypes=\"book,chapter\",\n",
+ "            structureFeatures=\"title,number\",\n",
+ "        ),\n",
+ "    ),\n",
+ "    silent=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "TF = Fabric(locations=f\"{MODIFIED}.{test}\", silent=True)\n",
+ "api = TF.loadAll(silent=True)\n",
+ "docs = api.makeAvailableIn(globals())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(('book', 99.0, 100, 100), ('chapter', 49.5, 101, 102), ('word', 1, 1, 99))"
+ ]
+ },
+ "execution_count": 57,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "C.levels.data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As expected."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Add types\n",
+ "\n",
+ "Adding types involves a lot of data, because we do not only add nodes, but also features about those nodes.\n",
+ "\n",
+ "The idea is this:\n",
+ "\n",
+ "Suppose that somewhere in another dataset, you have found lexeme nodes for the words in your data set.\n",
+ "\n",
+ "You just take those lexeme features, which may range from 100,000 to 110,000 say, and you find a way to map them to your\n",
+ "words, by means of a map `nodeSlots`.\n",
+ "\n",
+ "Then you can just grab those lexeme features *as they are*, and pack them into the `addTypes` argument,\n",
+ "together with the `nodeSlots` and the node boundaries (100,000 - 110,000).\n",
+ "\n",
+ "The new feature data cannot say anything about nodes in the input data set, because the new nodes will be shifted\n",
+ "so that they are past the `maxNode` of your input data set.\n",
+ "And if your feature data accidentally addresses nodes outside the declared range, those assignments will be ignored.\n",
+ "\n",
+ "So all in all, it is a rather clean addition of material.\n",
+ "\n",
+ "Maybe a bit too clean, because it is also impossible to add edge features that link the new nodes to the old nodes.\n",
+ "But then, it would be devilishly hard to make sure that after the necessary remapping of the edge features,\n",
+ "they address the intended nodes.\n",
+ "\n",
+ "If you do want edge features between old and new nodes, it is better to compute them in the new dataset and add them\n",
+ "as an individual feature or by another call to `modify()`.\n",
+ "\n",
+ "Let's have a look at an example where we add a type `bis` consisting of a few bigrams, and a type `tris`,\n",
+ "consisting of a bunch of trigrams.\n",
+ "\n",
+ "We just furnish a slot mapping for those nodes, and give them a `name` feature."
+ ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test = \"add.t\"\n", + "output = f\"{MODIFIED}.{test}\"\n", + "\n", + "!rm -rf {output}\n", + "\n", + "modify(\n", + " location,\n", + " output,\n", + " addTypes=dict(\n", + " bis=dict(\n", + " nodeFrom=1,\n", + " nodeTo=5,\n", + " nodeSlots={\n", + " 1: {10, 11},\n", + " 2: {20, 21},\n", + " 3: {30, 31},\n", + " 4: {40, 41},\n", + " 5: {50, 51},\n", + " },\n", + " nodeFeatures=dict(\n", + " name={\n", + " 1: \"b1\",\n", + " 2: \"b2\",\n", + " 3: \"b3\",\n", + " 4: \"b4\",\n", + " 5: \"b5\",\n", + " },\n", + " ),\n", + " edgeFeatures=dict(\n", + " link={\n", + " 1: {2: 100, 3: 50, 4: 25},\n", + " 2: {3: 50, 4: 25, 5: 12},\n", + " 3: {4: 25, 5: 12},\n", + " 4: {5: 12, 1: 6},\n", + " 5: {1: 6, 2: 3, 4: 1},\n", + " },\n", + " ),\n", + " ),\n", + " tris=dict(\n", + " nodeFrom=1,\n", + " nodeTo=4,\n", + " nodeSlots={\n", + " 1: {60, 61, 62},\n", + " 2: {70, 71, 72},\n", + " 3: {80, 81, 82},\n", + " 4: {90, 91, 94},\n", + " },\n", + " nodeFeatures=dict(\n", + " name={\n", + " 1: \"tr1\",\n", + " 2: \"tr2\",\n", + " 3: \"tr3\",\n", + " 4: \"tr4\",\n", + " },\n", + " ),\n", + " edgeFeatures=dict(\n", + " sim={\n", + " 1: {2, 3, 4},\n", + " 2: {3, 4},\n", + " 3: {4},\n", + " 4: {5, 1},\n", + " },\n", + " ),\n", + " ),\n", + " ),\n", + " silent=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Result" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "TF = Fabric(locations=f\"{MODIFIED}.{test}\", silent=True)\n", + "api = TF.loadAll(silent=True)\n", + "docs = api.makeAvailableIn(globals())" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(('book', 99.0, 100, 100),\n", + " ('chapter', 49.5, 101, 102),\n", + " ('sentence', 33.0, 115, 117),\n", + " ('line', 7.666666666666667, 103, 114),\n", + " ('tris', 3.0, 123, 126),\n", + " ('bis', 2.0, 118, 122),\n", + " ('word', 1, 1, 99))" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "C.levels.data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are the `bis` and `tris`!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['author',\n", + " 'gap',\n", + " 'letters',\n", + " 'name',\n", + " 'number',\n", + " 'otype',\n", + " 'punc',\n", + " 'terminator',\n", + " 'title']" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Fall()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And there is the new feature `name`:" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(118, 'b1'),\n", + " (119, 'b2'),\n", + " (120, 'b3'),\n", + " (121, 'b4'),\n", + " (122, 'b5'),\n", + " (123, 'tr1'),\n", + " (124, 'tr2'),\n", + " (125, 'tr3'),\n", + " (126, 'tr4')]" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted(F.name.items())" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['link', 'oslots', 'sim']" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Eall()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And the new edge features `link` and `sim`:" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(118, {121: '25', 120: '50', 119: '100'}),\n", + " (119, {122: '12', 121: '25', 120: '50'}),\n", + " (120, {122: '12', 121: '25'}),\n", + " (121, {118: '6', 122: '12'}),\n", + " (122, {121: '1', 119: '3', 118: '6'})]" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted(E.link.items())" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(123, frozenset({124, 125, 126})),\n", + " (124, frozenset({125, 126})),\n", + " (125, frozenset({126})),\n", + " (126, frozenset({123}))]" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted(E.sim.items())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And that is all for now.\n", + "\n", + "Incredible that you made it till here!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "All chapters:\n", + "\n", + "* [use](use.ipynb)\n", + "* [share](share.ipynb)\n", + "* [app](app.ipynb)\n", + "* [repo](repo.ipynb)\n", + "* *compose*\n", + "\n", + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + }, + "toc-autonumbering": false, + "toc-showtags": false, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorial/examples/banks/tf/0.1/author.tf b/tutorial/examples/banks/tf/0.1/author.tf new file mode 100644 index 0000000..3a2be8d --- /dev/null +++ b/tutorial/examples/banks/tf/0.1/author.tf @@ -0,0 +1,12 @@ +@node +@compiler=Dirk Roorda +@description=the author of a book +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@writtenBy=Text-Fabric +@dateWritten=2019-04-08T08:32:08Z +@version=0.1 + +100 Iain M. Banks diff --git a/tutorial/examples/banks/tf/0.1/gap.tf b/tutorial/examples/banks/tf/0.1/gap.tf new file mode 100644 index 0000000..83fcab9 --- /dev/null +++ b/tutorial/examples/banks/tf/0.1/gap.tf @@ -0,0 +1,17 @@ +@node +@compiler=Dirk Roorda +@description=1 for words that occur between [ ], which are inserted by the editor +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=int +@writtenBy=Text-Fabric +@dateWritten=2019-04-08T08:32:08Z + +10 1 +1 +1 +1 +78 1 +1 +1 diff --git a/tutorial/examples/banks/tf/0.1/letters.tf b/tutorial/examples/banks/tf/0.1/letters.tf new file mode 100644 index 0000000..c1f5cc3 --- /dev/null +++ b/tutorial/examples/banks/tf/0.1/letters.tf @@ -0,0 +1,109 @@ +@node +@compiler=Dirk Roorda +@description=the letters of a word +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@writtenBy=Text-Fabric +@dateWritten=2019-04-08T08:32:08Z + +Everything +about +us +everything +around +us +everything +we +know +and +can +know +of +is +composed +ultimately +of +patterns +of +nothing +that’s +the +bottom +line +the +final +truth +So +where +we +find +we +have +any +control +over +those +patterns +why +not +make +the +most +elegant +ones +the +most +enjoyable +and +good +ones +in +our +own +terms +Besides +it +left +the +humans +in +the +Culture +free +to +take +care +of +the +things +that +really +mattered +in +life +such +as +sports +games +romance +studying +dead +languages +barbarian +societies +and +impossible +problems +and +climbing +high +mountains +without +the +aid +of +a +safety +harness diff --git a/tutorial/examples/banks/tf/0.1/number.tf b/tutorial/examples/banks/tf/0.1/number.tf new file mode 100644 index 0000000..19565dc --- /dev/null +++ b/tutorial/examples/banks/tf/0.1/number.tf @@ -0,0 +1,27 @@ +@node +@compiler=Dirk Roorda +@description=number of chapter, or sentence in chapter, or line in sentence +@name=Culture quotes from Iain Banks +@source=Good Reads 
+@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=int +@writtenBy=Text-Fabric +@dateWritten=2019-04-08T08:32:08Z + +101 1 +2 +1 +2 +3 +4 +6 +7 +8 +1 +2 +3 +4 +5 +1 +2 +1 diff --git a/tutorial/examples/banks/tf/0.1/oslots.tf b/tutorial/examples/banks/tf/0.1/oslots.tf new file mode 100644 index 0000000..4e7b3d8 --- /dev/null +++ b/tutorial/examples/banks/tf/0.1/oslots.tf @@ -0,0 +1,27 @@ +@edge +@compiler=Dirk Roorda +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@writtenBy=Text-Fabric +@dateWritten=2019-04-08T08:32:08Z + +100 1-99 +1-55 +56-99 +1-3 +4-6 +7-9,14-20 +21-27 +28-38 +39-51 +52-55 +56 +57-75 +76-77,81-83 +84-88 +89-99 +1-27 +28-55 +56-99 diff --git a/tutorial/examples/banks/tf/0.1/otext.tf b/tutorial/examples/banks/tf/0.1/otext.tf new file mode 100644 index 0000000..d12a4ea --- /dev/null +++ b/tutorial/examples/banks/tf/0.1/otext.tf @@ -0,0 +1,11 @@ +@config +@compiler=Dirk Roorda +@fmt:text-orig-full={letters}{punc} +@name=Culture quotes from Iain Banks +@sectionFeatures=title,number,number +@sectionTypes=book,chapter,sentence +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@writtenBy=Text-Fabric +@dateWritten=2019-04-08T08:32:08Z + diff --git a/tutorial/examples/banks/tf/0.1/otype.tf b/tutorial/examples/banks/tf/0.1/otype.tf new file mode 100644 index 0000000..5db9f11 --- /dev/null +++ b/tutorial/examples/banks/tf/0.1/otype.tf @@ -0,0 +1,14 @@ +@node +@compiler=Dirk Roorda +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@writtenBy=Text-Fabric +@dateWritten=2019-04-08T08:32:08Z + +1-99 word +100 book +101-102 chapter +103-114 line +115-117 sentence diff --git a/tutorial/examples/banks/tf/0.1/punc.tf b/tutorial/examples/banks/tf/0.1/punc.tf new file mode 100644 index 0000000..52c6b9b --- /dev/null +++ b/tutorial/examples/banks/tf/0.1/punc.tf @@ -0,0 +1,27 @@ +@node +@compiler=Dirk Roorda +@description=the punctuation after a word +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@writtenBy=Text-Fabric +@dateWritten=2019-04-08T08:32:08Z + +3 , +6 , +20 ; +24 , +27 . +38 , +45 , +51 , +55 ? +, +75 , +78 , +, +, +83 , +88 , +99 . diff --git a/tutorial/examples/banks/tf/0.1/terminator.tf b/tutorial/examples/banks/tf/0.1/terminator.tf new file mode 100644 index 0000000..2fd99f3 --- /dev/null +++ b/tutorial/examples/banks/tf/0.1/terminator.tf @@ -0,0 +1,22 @@ +@node +@compiler=Dirk Roorda +@description=the last character of a line +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@writtenBy=Text-Fabric +@dateWritten=2019-04-08T08:32:08Z + +103 , +, +; +. +, +, +? +, +, +, +, +. 
diff --git a/tutorial/examples/banks/tf/0.1/title.tf b/tutorial/examples/banks/tf/0.1/title.tf new file mode 100644 index 0000000..51b3872 --- /dev/null +++ b/tutorial/examples/banks/tf/0.1/title.tf @@ -0,0 +1,11 @@ +@node +@compiler=Dirk Roorda +@description=the title of a book +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@writtenBy=Text-Fabric +@dateWritten=2019-04-08T08:32:08Z + +100 Consider Phlebas diff --git a/tutorial/examples/banks/tf/0.2/author.tf b/tutorial/examples/banks/tf/0.2/author.tf new file mode 100644 index 0000000..3821f88 --- /dev/null +++ b/tutorial/examples/banks/tf/0.2/author.tf @@ -0,0 +1,12 @@ +@node +@compiler=Dirk Roorda +@description=the author of a book +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@writtenBy=Text-Fabric +@dateWritten=2019-04-09T12:04:34Z +@version=0.2 + +100 Iain M. Banks diff --git a/tutorial/examples/banks/tf/0.2/gap.tf b/tutorial/examples/banks/tf/0.2/gap.tf new file mode 100644 index 0000000..7ce493b --- /dev/null +++ b/tutorial/examples/banks/tf/0.2/gap.tf @@ -0,0 +1,17 @@ +@node +@compiler=Dirk Roorda +@description=1 for words that occur between [ ], which are inserted by the editor +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=int +@writtenBy=Text-Fabric +@dateWritten=2019-04-09T12:04:34Z + +10 1 +1 +1 +1 +78 1 +1 +1 diff --git a/tutorial/examples/banks/tf/0.2/letters.tf b/tutorial/examples/banks/tf/0.2/letters.tf new file mode 100644 index 0000000..85b45c4 --- /dev/null +++ b/tutorial/examples/banks/tf/0.2/letters.tf @@ -0,0 +1,109 @@ +@node +@compiler=Dirk Roorda +@description=the letters of a word +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@writtenBy=Text-Fabric +@dateWritten=2019-04-09T12:04:34Z + +Everything +about +us +everything +around +us +everything +we +know +and +can +know +of +is +composed +ultimately +of +patterns +of +nothing +that’s +the +bottom +line +the +final +truth +So +where +we +find +we +have +any +control +over +those +patterns +why +not +make +the +most +elegant +ones +the +most +enjoyable +and +good +ones +in +our +own +terms +Besides +it +left +the +humans +in +the +Culture +free +to +take +care +of +the +things +that +really +mattered +in +life +such +as +sports +games +romance +studying +dead +languages +barbarian +societies +and +impossible +problems +and +climbing +high +mountains +without +the +aid +of +a +safety +harness diff --git a/tutorial/examples/banks/tf/0.2/number.tf b/tutorial/examples/banks/tf/0.2/number.tf new file mode 100644 index 0000000..7b19be6 --- /dev/null +++ b/tutorial/examples/banks/tf/0.2/number.tf @@ -0,0 +1,27 @@ +@node +@compiler=Dirk Roorda +@description=number of chapter, or sentence in chapter, or line in sentence +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=int +@writtenBy=Text-Fabric +@dateWritten=2019-04-09T12:04:34Z + +101 1 +2 +1 +2 +3 +4 +6 +7 +8 +1 +2 +3 +4 +5 +1 +2 +1 diff --git a/tutorial/examples/banks/tf/0.2/oslots.tf b/tutorial/examples/banks/tf/0.2/oslots.tf new file mode 100644 index 0000000..2255eb1 --- /dev/null +++ b/tutorial/examples/banks/tf/0.2/oslots.tf @@ -0,0 +1,27 @@ +@edge 
+@compiler=Dirk Roorda +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@writtenBy=Text-Fabric +@dateWritten=2019-04-09T12:04:34Z + +100 1-99 +1-55 +56-99 +1-3 +4-6 +7-9,14-20 +21-27 +28-38 +39-51 +52-55 +56 +57-75 +76-77,81-83 +84-88 +89-99 +1-27 +28-55 +56-99 diff --git a/tutorial/examples/banks/tf/0.2/otext.tf b/tutorial/examples/banks/tf/0.2/otext.tf new file mode 100644 index 0000000..c284e49 --- /dev/null +++ b/tutorial/examples/banks/tf/0.2/otext.tf @@ -0,0 +1,11 @@ +@config +@compiler=Dirk Roorda +@fmt:text-orig-full={letters}{punc} +@name=Culture quotes from Iain Banks +@sectionFeatures=title,number,number +@sectionTypes=book,chapter,sentence +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@writtenBy=Text-Fabric +@dateWritten=2019-04-09T12:04:34Z + diff --git a/tutorial/examples/banks/tf/0.2/otype.tf b/tutorial/examples/banks/tf/0.2/otype.tf new file mode 100644 index 0000000..72b330b --- /dev/null +++ b/tutorial/examples/banks/tf/0.2/otype.tf @@ -0,0 +1,14 @@ +@node +@compiler=Dirk Roorda +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@writtenBy=Text-Fabric +@dateWritten=2019-04-09T12:04:34Z + +1-99 word +100 book +101-102 chapter +103-114 line +115-117 sentence diff --git a/tutorial/examples/banks/tf/0.2/punc.tf b/tutorial/examples/banks/tf/0.2/punc.tf new file mode 100644 index 0000000..bc0ec5e --- /dev/null +++ b/tutorial/examples/banks/tf/0.2/punc.tf @@ -0,0 +1,27 @@ +@node +@compiler=Dirk Roorda +@description=the punctuation after a word +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@writtenBy=Text-Fabric +@dateWritten=2019-04-09T12:04:34Z + +3 , +6 , +20 ; +24 , +27 . +38 , +45 , +51 , +55 ? +, +75 , +78 , +, +, +83 , +88 , +99 . diff --git a/tutorial/examples/banks/tf/0.2/terminator.tf b/tutorial/examples/banks/tf/0.2/terminator.tf new file mode 100644 index 0000000..e073c00 --- /dev/null +++ b/tutorial/examples/banks/tf/0.2/terminator.tf @@ -0,0 +1,22 @@ +@node +@compiler=Dirk Roorda +@description=the last character of a line +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@writtenBy=Text-Fabric +@dateWritten=2019-04-09T12:04:34Z + +103 , +, +; +. +, +, +? +, +, +, +, +. 
diff --git a/tutorial/examples/banks/tf/0.2/title.tf b/tutorial/examples/banks/tf/0.2/title.tf new file mode 100644 index 0000000..e32c4c5 --- /dev/null +++ b/tutorial/examples/banks/tf/0.2/title.tf @@ -0,0 +1,11 @@ +@node +@compiler=Dirk Roorda +@description=the title of a book +@name=Culture quotes from Iain Banks +@source=Good Reads +@url=https://www.goodreads.com/work/quotes/14366-consider-phlebas +@valueType=str +@writtenBy=Text-Fabric +@dateWritten=2019-04-09T12:04:34Z + +100 Consider Phlebas diff --git a/tutorial/examples/bankssim/tf/0.1/sim.tf b/tutorial/examples/bankssim/tf/0.1/sim.tf new file mode 100644 index 0000000..a5b5da4 --- /dev/null +++ b/tutorial/examples/bankssim/tf/0.1/sim.tf @@ -0,0 +1,1101 @@ +@edge +@edgeValues +@converters=Dirk Roorda +@description=similarity between ayas, as a percentage of the common material wrt the combined material +@name=Banks (similar words) +@sourceUrl=https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/text-fabric/use.ipynb +@valueType=int +@writtenBy=Text-Fabric +@dateWritten=2019-04-08T17:38:06Z + +15 7 +1 2,9,12,23,41,43,47,76 8 +1 10-11,49-50,53-54,82,86,89,95 9 +1 8,14,30,32,65 10 +1 87-88 13 +1 5,60 15 +1 21,26,56,78-79 17 +1 31,45,51,58,66-67,75 18 +1 34,39-40,64,71 20 +1 48,92 21 +1 52,57,61,74 22 +1 80,83,90 23 +1 35,63,73,85,93,98 25 +1 37,55,72,84 27 +1 24,27,29,33,36 30 +1 16 31 +1 18,22,25,38,42,46,59,62,69,91,94 33 +1 44,99 36 +1 81 42 +1 20,70 50 +1 4,7 100 +4,7 8 +2 15,90 9 +2 70,99 10 +2 26,55-56,72,79 11 +2 9,12,33,36,41,45,51,58,67,76 12 +2 10-11,22,25,34,42,46,49-50,54,59,62,69,82,86,89,94-95 14 +2 3,6,13,17,19,28,57,68,77,96 17 +2 81,87-88 18 +2 18,38,80,83,97 20 +2 20,35,44,60,63,73,85,98 22 +2 21,37,78,84 25 +2 27,43,47,66 29 +2 16,48 30 +2 40,53,71 33 +2 5,93 38 +2 65 40 +2 92 44 +2 23 50 +16,87-88 11 +3 15,18,38 12 +3 5,63,70,85,93,98-99 14 +3 21,37,55-56,78-79 17 +3 27,43,45,47,51 20 +3 53,81,92 25 +3 83 29 +3 14,28,60,77 33 +3 76 50 +3 6 100 +15 7 +4 9,12,23,41,43,47,76 8 +4 10-11,49-50,53-54,82,86,89,95 9 +4 8,14,30,32,65 10 +4 87-88 13 +4 5,60 15 +4 21,26,56,78-79 17 +4 31,45,51,58,66-67,75 18 +4 34,39-40,64,71 20 +4 48,92 21 +4 52,57,61,74 22 +4 80,83,90 23 +4 35,63,73,85,93,98 25 +4 37,55,72,84 27 +4 24,27,29,33,36 30 +4 16 31 +4 18,22,25,38,42,46,59,62,69,91,94 33 +4 44,99 36 +4 81 42 +4 20,70 50 +4 7 100 +87,90 8 +5 70,85,98 9 +5 21,37,55-56,79 10 +5 23-24,29,33,41,43,47,66,76 11 +5 64,71 12 +5 6,13,17,19,28,52,61,65,68,74,77,96 14 +5 7 15 +5 16,88,97 17 +5 15 18 +5 20,44,63,93 20 +5 26,72,78 22 +5 9,12,27,31,36,45,51,67 25 +5 48,81 27 +5 11,34,40,50,54,82,95 29 +5 18,38,83 30 +5 35,60,73,99 33 +5 84 38 +5 92 40 +5 80 44 +5 10,49,53,86,89 50 +16,87-88 11 +6 15,18,38 12 +6 63,70,85,93,98-99 14 +6 21,37,55-56,78-79 17 +6 27,43,45,47,51 20 +6 53,81,92 25 +6 83 29 +6 14,28,60,77 33 +6 76 50 +15 7 +7 9,12,23,41,43,47,76 8 +7 10-11,49-50,53-54,82,86,89,95 9 +7 8,14,30,32,65 10 +7 87-88 13 +7 60 15 +7 21,26,56,78-79 17 +7 31,45,51,58,66-67,75 18 +7 34,39-40,64,71 20 +7 48,92 21 +7 52,57,61,74 22 +7 80,83,90 23 +7 35,63,73,85,93,98 25 +7 37,55,72,84 27 +7 24,27,29,33,36 30 +7 16 31 +7 18,22,25,38,42,46,59,62,69,91,94 33 +7 44,99 36 +7 81 42 +7 20,70 50 +16,48,87-88 11 +8 15,18,38,80,83 12 +8 44,63,73,85,93,98-99 14 +8 37,55-56,72,79 17 +8 9,12,24,33,36,41,45,51,58,66-67,75 20 +8 22,25,39,42,46,54,59,62,64,69,82,94 25 +8 29 50 +8 30,32 100 +81,87-88 9 +9 15,18,38,83,90 10 +9 44,60,70,85,99 11 +9 26,37,78,84 12 +9 23-24,29,31,36,41,43,47,66 14 +9 10-11,34,39,49-50,53,86,89 17 +9 
13,17,19,28,30,32,48,52,61,65,68,74,92,96 20 +9 80 22 +9 20,35,93 25 +9 45,51 33 +9 40 40 +9 54 75 +9 12 100 +16 10 +10 15,90 11 +10 20,35,70,98 12 +10 21,56,72,79 14 +10 12,24,33,41,45,51,66-67 17 +10 40,50,54,71 20 +10 48,81,92 22 +10 18,38,52,61,74,77,80,83 25 +10 44,60,73,99 29 +10 26,84,97 33 +10 31 40 +10 11,34,82,95 50 +10 49,86,89 100 +16,81 10 +11 15 11 +11 20,63,70,73,85,98 12 +11 21,72,79 14 +11 12,24,31,33,41,45,51,66,76 17 +11 40,54,71,82,95 20 +11 48,92 22 +11 18,38,52,61,74,77,83,90 25 +11 35,44,60,99 29 +11 26,84,97 33 +11 67 40 +11 80 43 +11 34,49,86,89 50 +81,87-88 9 +12 15,18,38,83,90 10 +12 44,60,70,85,99 11 +12 26,37,78,84 12 +12 23-24,29,31,36,41,43,47,66 14 +12 34,39,49-50,53,86,89 17 +12 13,17,19,28,30,32,48,52,61,65,68,74,92,96 20 +12 80 22 +12 20,35,93 25 +12 45,51 33 +12 40 40 +12 54 75 +48,87-88,92 11 +13 15,80 12 +13 20,35,85,93,98 14 +13 26,37,78 17 +13 23,31,36,43,45,47,51,58,75 20 +13 40,50,53-54,64 25 +13 28,65 33 +13 17,19,68,96 100 +16,88 11 +14 15,18,38,83,90 12 +14 20,60,93,98-99 14 +14 21,26,37,55,78-79,84 17 +14 24,31,43,45,47,51,75-76 20 +14 81,87,91-92,95 25 +14 28,52,57,61,70,74,77,85 33 +14 56 40 +20,44,70,93 8 +15 21,72 9 +15 24,29,31,33,58,66,75 10 +15 22,25,40,42,46,49,53-54,59,62,64,69,86,89,94-95 11 +15 17,19,30,32,65,68,77,96 12 +15 16,48,81 15 +15 83,90 17 +15 35,60,63,98-99 18 +15 23,36,41,67,76 22 +15 50,82,92 25 +15 18,38 27 +15 28 29 +15 73 30 +15 37,55-56,78-79 33 +15 43,45,47,51 38 +15 80 40 +15 85 44 +15 87-88 50 +78 8 +16 29,31,36,45,51,76 9 +16 39-40,49,53,64,86,89,91 10 +16 30,32,52,61,65,74,77 11 +16 97 12 +16 20,35,70,99 17 +16 21,37,56,84 18 +16 23,27,33,43,47,67 20 +16 22,25,34,42,46,59,62,69,71,82,94-95 22 +16 88 23 +16 18,38,57,80,90 25 +16 60,85,93 27 +16 26,55,79 30 +16 24,41,48,58,66,75,81,87 33 +16 83 36 +16 44,63,73,98 40 +16 72 44 +16 92 45 +48,87-88,92 11 +17 80 12 +17 20,35,85,93,98 14 +17 26,37,78 17 +17 23,31,36,43,45,47,51,58,75 20 +17 40,50,53-54,64 25 +17 28,65 33 +17 19,68,96 100 +90,93 8 +18 23,31,75-76 10 +18 53-54,95 11 +18 28,30,32,52,57,61,65,74 12 +18 97 14 +18 20 18 +18 26,56 20 +18 24,27,29,33,36,41,43,47,58 22 +18 22,25,34,40,42,46,48-49,59,62,64,69,71,81-82,86-87,89,94 25 +18 77 29 +18 35,60,63,70,85 30 +18 21,37,72,79,84 33 +18 88,92 36 +18 45,51,66-67 38 +18 80,83 40 +18 44,73,98 44 +18 55,78 50 +18 99 62 +18 38 100 +48,87-88,92 11 +19 80 12 +19 20,35,85,93,98 14 +19 26,37,78 17 +19 23,31,36,43,45,47,51,58,75 20 +19 40,50,53-54,64 25 +19 28,65 33 +19 68,96 100 +88 8 +20 63,73,98 9 +20 55-56,79 10 +20 29,33,36,58,66,75-76 11 +20 34,39,49,53,86,89,95 12 +20 28,68,96 14 +20 48,87 17 +20 38,80,83 18 +20 60,99 20 +20 21,26,78,84 22 +20 23-24,27,31,43,45,47,51 25 +20 22,25,42,46,50,54,59,62,69,71,94 29 +20 90 30 +20 35,44,52,57,61,65,74,85 33 +20 37 38 +20 81,92 40 +20 40,91,93 50 +20 70 71 +48,87-88 8 +21 80 9 +21 35,63 10 +21 26,56,72,84 11 +21 23,29,41,45,51,58,67 12 +21 34,39-40,49,82,86,89,91,95 14 +21 28,57,65 17 +21 81 18 +21 83,97 20 +21 44,73,85,93 22 +21 55,78-79 25 +21 27,33,43,47,66,76 29 +21 92 30 +21 22,25,38,42,46,59,62,69,94 33 +21 60,70,98-99 38 +21 77 40 +21 37 43 +21 71 60 +48,81,87-88,92 10 +22 80,83 11 +22 35,60 12 +22 56,72,78-79 14 +22 23-24,36,41,43,45,47,51,67,75-76 17 +22 39-40,64,82,91 20 +22 30,32,38,57,65 25 +22 44,63,70,73,85,93,98-99 29 +22 55 33 +22 27,29,33,58,66 40 +22 71 50 +22 37 60 +22 25,42,46,59,62,69,94 100 +81 9 +23 38 10 +23 44,60,63,70,98 11 +23 56,79,84 12 +23 27,36,41,45,51,58,66 14 +23 25,42,46,50,53-54,59,62,69,71,94 17 +23 28,48,57,68,96 20 +23 80,90 22 +23 
35,73,85,93 25 +23 37,55,78 29 +23 87-88,92 33 +23 40 40 +23 65 50 +23 43,47 60 +60,73,93,98 11 +24 37,55,79 12 +24 29,33,36,41,66-67 14 +24 25,34,40,42,46,49,54,59,62,64,69,82,86,89,91,94-95 17 +24 30,32,57,81,88,92 20 +24 38,80 22 +24 35,63,70,85,99 25 +24 56,72,84 29 +24 31,45,48,51,58,87 33 +24 83,90 38 +24 44 43 +24 26,52,61,74 50 +24 75 60 +48,81,87-88,92 10 +25 80,83 11 +25 35,60 12 +25 56,72,78-79 14 +25 36,41,43,45,47,51,67,75-76 17 +25 39-40,64,82,91 20 +25 30,32,38,57,65 25 +25 44,63,70,73,85,93,98-99 29 +25 55 33 +25 27,29,33,58,66 40 +25 71 50 +25 37 60 +25 42,46,59,62,69,94 100 +88 8 +26 63,73,85,93 10 +26 56,79 11 +26 33,41,45,51,66-67 12 +26 40,54,64,71,82,91 14 +26 57,68,77,96 17 +26 81,87 18 +26 38,80,97 20 +26 35,60,70,98-99 22 +26 72 25 +26 58 29 +26 48,92 30 +26 34,49,83,86,89-90,95 33 +26 44 38 +26 52,61,74 40 +26 84 43 +26 31,75 50 +88 9 +27 80,83 10 +27 44,85,98 11 +27 72,84 12 +27 33,36,43,47,58,66-67 14 +27 39-40,64,91 17 +27 57,65,81,92 20 +27 38 22 +27 35,60,70,73,99 25 +27 37,55,78 29 +27 29,76 33 +27 42,46,53,59,62,69,71,94 40 +27 63,93 43 +48,81 11 +28 38,80,83 12 +28 35,60,70,93,98-99 14 +28 55-56,79 17 +28 36,76 20 +28 40,50,53-54,87-88,92 25 +28 65,68,77,85,96 33 +28 37,78 40 +28 43,45,47,51 50 +48,87 9 +29 83 10 +29 35,44,60,70,85,98 11 +29 56,78-79,84 12 +29 41,45,51,58,66,75-76 14 +29 53-54,71,82,91 17 +29 88 20 +29 38,80 22 +29 63,73,93 25 +29 37,55,72 29 +29 33,36,67 33 +29 39,42,46,59,62,64,69,94 40 +29 99 43 +29 30,32 50 +48,87-88 11 +30 38,80,83 12 +30 44,63,73,85,93,98-99 14 +30 37,55-56,72,79 17 +30 33,36,41,45,51,58,66-67,75 20 +30 39,42,46,54,59,62,64,69,82,94 25 +30 32 100 +48,87 9 +31 38,80,83 10 +31 35,44,60,73,85,93,98-99 11 +31 45,51,58 14 +31 34,40,50,54,64,82,91 17 +31 57,68,92,96 20 +31 90 22 +31 70 25 +31 56,84 29 +31 75,81 33 +31 49,86,89,95 40 +31 52,61,74 50 +48,87-88 11 +32 38,80,83 12 +32 44,63,73,85,93,98-99 14 +32 37,55-56,72,79 17 +32 33,36,41,45,51,58,66-67,75 20 +32 39,42,46,54,59,62,64,69,82,94 25 +87-88,92 9 +33 63,70,85,93 11 +33 55-56,84 12 +33 45,51,58,75-76 14 +33 34,39,49,64,86,89,91,95 17 +33 48,77 20 +33 38,80,83 22 +33 44,60,73,97-98 25 +33 37,72,79 29 +33 36,41,66-67 33 +33 42,46,59,62,69,71,82,94 40 +33 99 43 +90 11 +34 35,70,73 12 +34 79 14 +34 41,45,51,66-67 17 +34 39-40,54,71,82,95 20 +34 81,92 22 +34 38,52,61,74,77,80,83 25 +34 44,60,98-99 29 +34 72,84,97 33 +34 48 38 +34 49,86,89 50 +60,98 9 +35 66,75-76 11 +35 42,46,49-50,59,62,64,69,71,86,89,94 12 +35 52,57,61,68,74,96 14 +35 81,87 17 +35 83 18 +35 70,73,93,99 20 +35 37,55,72,84 22 +35 36,43,45,47,51,58,67 25 +35 48,88,92 27 +35 53-54 29 +35 38,90 30 +35 44,65,85 33 +35 78 38 +35 80 44 +35 40,63 50 +92 9 +36 83 10 +36 44,93,98 11 +36 56,79,84 12 +36 41,43,47,58,66,75 14 +36 40,42,46,50,54,59,62,69,82,94 17 +36 48,65,68,87,96 20 +36 38 22 +36 63,73,85,99 25 +36 37,55,72,78 29 +36 45,51,67,88 33 +36 80 38 +36 53,64 40 +72 11 +37 41,67,75 12 +37 39,50,53-54,64,82,91 14 +37 57,68,77,96 17 +37 48,81 18 +37 80,83 20 +37 44,60,63,73 22 +37 56,79 25 +37 58,66,76 29 +37 87-88,92 30 +37 38,40,71 33 +37 70,93,98-99 38 +37 65 40 +37 55,78 43 +37 43,45,47,51 50 +37 85 57 +37 42,46,59,62,69,94 60 +90,93 8 +38 75-76 10 +38 53-54,95 11 +38 52,57,61,65,74 12 +38 97 14 +38 56 20 +38 41,43,47,58 22 +38 40,42,46,48-49,59,62,64,69,71,81-82,86-87,89,94 25 +38 77 29 +38 60,63,70,85 30 +38 72,79,84 33 +38 88,92 36 +38 45,51,66-67 38 +38 80,83 40 +38 44,73,98 44 +38 55,78 50 +38 99 62 +48,81 10 +39 60,70,98-99 12 +39 72 14 +39 76 17 +39 42,46,54,59,62,69,71,91,94 20 +39 93 29 
+87-88 10 +40 83,90 11 +40 60,63,73,98-99 12 +40 55,84 14 +40 58,66 17 +40 42,46,49-50,53,59,62,69,71,86,89,94 20 +40 48,81 22 +40 52,57,61,68,74,80,96 25 +40 44,70,85,93 29 +40 78 33 +40 92 38 +40 43,45,47,51 40 +40 54 50 +40 65 67 +90 10 +41 63,85 11 +41 56,84 12 +41 43,45,47,51,58,75 14 +41 42,46,49,59,62,64,69,71,86,89,94-95 17 +41 48,77,87-88,92 20 +41 83 22 +41 44,60,97-99 25 +41 55,72 29 +41 67 33 +41 80 38 +41 82 40 +41 73 43 +41 79 50 +41 66 60 +48,81,87-88,92 10 +42 80,83 11 +42 60 12 +42 56,72,78-79 14 +42 43,45,47,51,67,75-76 17 +42 64,82,91 20 +42 57,65 25 +42 44,63,70,73,85,93,98-99 29 +42 55 33 +42 58,66 40 +42 71 50 +42 46,59,62,69,94 100 +48 9 +43 83,90 10 +43 44,63,99 11 +43 56 12 +43 58,66,76 14 +43 46,50,53-54,59,62,69,71,94 17 +43 57,68,77,81,96 20 +43 80 22 +43 60,70,73,93,98 25 +43 79 29 +43 45,51,87-88 33 +43 85 43 +43 55,65,78,92 50 +43 47 100 +93 9 +44 56,78 10 +44 47 11 +44 50,54,64,91,95 12 +44 52,57,61,65,74,77 14 +44 87-88,97 17 +44 60,85 20 +44 55,84 22 +44 45,51,67,75 25 +44 81,92 27 +44 46,49,59,62,69,71,82,86,89,94 29 +44 80,90 30 +44 63,70,73,98-99 33 +44 72,79 38 +44 48 40 +44 58,66 43 +44 83 62 +90 10 +45 63,73,93 11 +45 72,84 12 +45 58,66-67,75-76 14 +45 46,49-50,53,59,62,64,69,82,86,89,94 17 +45 52,61,65,68,74,77,81,96 20 +45 60,70,98 25 +45 55-56,78-79 29 +45 47-48,87-88,92 33 +45 80,83 38 +45 54 40 +45 85,99 43 +45 51 100 +48,81,87-88,92 10 +46 80,83 11 +46 60 12 +46 56,72,78-79 14 +46 47,51,67,75-76 17 +46 64,82,91 20 +46 57,65 25 +46 63,70,73,85,93,98-99 29 +46 55 33 +46 58,66 40 +46 71 50 +46 59,62,69,94 100 +48 9 +47 83,90 10 +47 63,99 11 +47 56 12 +47 58,66,76 14 +47 50,53-54,59,62,69,71,94 17 +47 57,68,77,81,96 20 +47 80 22 +47 60,70,73,93,98 25 +47 79 29 +47 51,87-88 33 +47 85 43 +47 55,65,78,92 50 +55,70,78,93 8 +48 50,53,59,62,64,69,71,94-95 10 +48 52,61,65,68,74,77,96 11 +48 97 12 +48 81 14 +48 60,63,73,85 17 +48 56,79 18 +48 58,66-67,75 20 +48 49,54,82,86,89 22 +48 92 23 +48 90 25 +48 98-99 27 +48 84 30 +48 51,87-88 33 +48 80,83 36 +48 72 44 +90 11 +49 70,98 12 +49 56,72,79 14 +49 51,66-67 17 +49 50,54,71 20 +49 81,92 22 +49 52,61,74,77,80,83 25 +49 60,73,99 29 +49 84,97 33 +49 82,95 50 +49 86,89 100 +87-88,92 10 +50 80,83,90 11 +50 70,73,85,93 12 +50 56,78-79 14 +50 51 17 +50 53-54,82,86,89,91,95 20 +50 81 22 +50 65,68,96 25 +90 10 +51 63,73,93 11 +51 72,84 12 +51 58,66-67,75-76 14 +51 53,59,62,64,69,82,86,89,94 17 +51 52,61,65,68,74,77,81,96 20 +51 60,70,98 25 +51 55-56,78-79 29 +51 87-88,92 33 +51 80,83 38 +51 54 40 +51 85,99 43 +87 11 +52 80,83 12 +52 60,85,93,99 14 +52 56 17 +52 75 20 +52 54,81,86,89,91-92,95 25 +52 90 29 +52 57,70 33 +52 84 40 +52 61,74 100 +81,87 10 +53 83 11 +53 60,73,85,99 12 +53 55,72,84 14 +53 67,76 17 +53 54,64 20 +53 88,92 22 +53 65,68,80,96 25 +53 63,93 29 +53 78 33 +81,87-88 10 +54 83,90 11 +54 60,70,85,99 12 +54 78,84 14 +54 86,89 20 +54 92 22 +54 61,65,68,74,80,96 25 +54 93 29 +90 9 +55 93 10 +55 84 11 +55 75-76 12 +55 71,82 14 +55 57,65,77 17 +55 81 18 +55 83 20 +55 60,70 22 +55 56,72 25 +55 58,66-67 29 +55 87,92 30 +55 59,62,64,69,80,94 33 +55 63,85,98-99 38 +55 78-79 43 +55 88 44 +55 73 57 +80 9 +56 60,63,93 10 +56 72,78 11 +56 58,66-67,76 12 +56 59,62,64,69,86,89,91,94 14 +56 57,61,74,77 17 +56 92 18 +56 83,90 20 +56 70,73,98-99 22 +56 79,84 25 +56 75 29 +56 81,88 30 +56 82,95 33 +56 85 38 +56 87 44 +87 11 +57 90 12 +57 63,73,98 14 +57 78,84 17 +57 58,66,75 20 +57 59,62,69,71,81,91-92,94-95 25 +57 61,65,70,74,85,93 33 +81,92 9 +58 80,90 10 +58 70,93,99 11 +58 78-79 12 +58 67 14 +58 71,82 17 +58 
65,68,87-88,96 20 +58 83 22 +58 73,85 25 +58 72 29 +58 66 33 +58 59,62,64,69,94 40 +58 63,98 43 +58 75 60 +81,87-88,92 10 +59 80,83 11 +59 60 12 +59 72,78-79 14 +59 67,75-76 17 +59 64,82,91 20 +59 65 25 +59 63,70,73,85,93,98-99 29 +59 66 40 +59 71 50 +59 62,69,94 100 +63,85 9 +60 72,78 10 +60 66-67 11 +60 62,69,82,91,94-95 12 +60 61,74 14 +60 87-88,97 17 +60 90 18 +60 73,93,98 20 +60 84 22 +60 81 27 +60 71,86,89 29 +60 80 30 +60 70,77 33 +60 79 38 +60 76 43 +60 83 44 +60 99 50 +60 92 56 +87 11 +61 80,83 12 +61 85,93,99 14 +61 75 20 +61 81,86,89,91-92,95 25 +61 90 29 +61 70 33 +61 84 40 +61 74 100 +81,87-88,92 10 +62 80,83 11 +62 72,78-79 14 +62 67,75-76 17 +62 64,82,91 20 +62 65 25 +62 63,70,73,85,93,98-99 29 +62 66 40 +62 71 50 +62 69,94 100 +70 9 +63 79,84 10 +63 71,82 12 +63 65 14 +63 81,87,92 17 +63 90 18 +63 93,98-99 20 +63 78 22 +63 66,75-76 25 +63 88 27 +63 64,69,94 29 +63 80,83 30 +63 73,85 33 +63 72 38 +63 67 43 +87 10 +64 83 11 +64 85 12 +64 78-79,84 14 +64 66 17 +64 69,82,94 20 +64 88 22 +64 68,80,96 25 +64 73,98-99 29 +64 72 33 +64 67,75 40 +81,87-88 11 +65 80 12 +65 70,73,98 14 +65 66 20 +65 69,71,92,94 25 +65 68,85,93,96 33 +65 78 40 +81,87-88 9 +66 70,93 11 +66 78,84 12 +66 75 14 +66 86,89,95 17 +66 77,92 20 +66 80,83 22 +66 85,97,99 25 +66 72,79 29 +66 67 33 +66 69,71,82,94 40 +66 73,98 43 +87,92 9 +67 90 10 +67 78 12 +67 75-76 14 +67 69,71,86,89,94-95 17 +67 77,88 20 +67 83 22 +67 85,97-98 25 +67 79,84 29 +67 82 40 +67 73,99 43 +67 72 50 +67 80 57 +87-88,92 11 +68 80 12 +68 85,93,98 14 +68 78 17 +68 75 20 +68 96 100 +81,87-88,92 10 +69 80,83 11 +69 72,78-79 14 +69 75-76 17 +69 82,91 20 +69 70,73,85,93,98-99 29 +69 71 50 +69 94 100 +80,88 8 +70 73 9 +70 75 11 +70 86,89,95 12 +70 77 14 +70 87 17 +70 98 20 +70 78-79,84 22 +70 76 25 +70 71,94 29 +70 83,90 30 +70 74,85,93,99 33 +70 92 40 +70 91 50 +70 81 56 +81 10 +71 80,83 11 +71 85 12 +71 72,78-79,84 14 +71 76 17 +71 82,86,89,91,95 20 +71 92 22 +71 77 25 +71 73,93,98-99 29 +71 97 33 +71 94 50 +81,92 8 +72 90 9 +72 85 10 +72 78 11 +72 86,89,94-95 14 +72 77 17 +72 87 18 +72 97 20 +72 79,84 25 +72 75 29 +72 88 30 +72 80,82-83 33 +72 73,98-99 38 +90 8 +73 93 9 +73 75 11 +73 77 14 +73 81,87,97 17 +73 83 18 +73 85 20 +73 78,84 22 +73 88,92 27 +73 86,89,94-95 29 +73 98-99 33 +73 79 38 +73 80 44 +73 82 50 +87 11 +74 80,83 12 +74 85,93,99 14 +74 75 20 +74 81,86,89,91-92,95 25 +74 90 29 +74 84 40 +81,92 9 +75 80 10 +75 93,99 11 +75 79,84 12 +75 82,91,94-95 17 +75 88,96 20 +75 83,90 22 +75 85,98 25 +75 87 33 +87-88 9 +76 80,90 10 +76 98 11 +76 78-79 12 +76 91,94 17 +76 77,81,92 20 +76 83 22 +76 85,93,99 25 +81,87-88 11 +77 80 12 +77 85 14 +77 78,84 17 +77 82,86,89,92,95 25 +77 83 29 +77 98-99 33 +77 79 40 +77 97 50 +83 9 +78 79,84 11 +78 94 14 +78 96 17 +78 81 18 +78 80 20 +78 93,98-99 22 +78 87,92 30 +78 85 38 +78 88 44 +84 11 +79 86,89,91,94-95 14 +79 81 18 +79 90,97 20 +79 85 22 +79 87-88,92 30 +79 80,82 33 +79 98-99 38 +79 83 50 +81 7 +80 93 8 +80 94-95 11 +80 96 12 +80 97 14 +80 98 18 +80 82,86-87,89 25 +80 83,90 27 +80 85 30 +80 84 33 +80 88,92 36 +80 99 44 +88 7 +81 82,94 10 +81 87 14 +81 99 17 +81 84 18 +81 86,89,91,95 22 +81 90 25 +81 85,93,98 27 +81 83 36 +81 92 45 +87-88,92 10 +82 85 12 +82 84 14 +82 94 20 +82 83 25 +82 98-99 29 +82 97 33 +82 86,89,95 50 +93 8 +83 91,94-95 11 +83 97 14 +83 85 18 +83 84 20 +83 86-89 25 +83 90 27 +83 98 30 +83 92 36 +83 99 44 +85,93,98 10 +84 91 14 +84 87-88 18 +84 97 20 +84 92 30 +84 86,89-90,95 33 +84 99 38 +91,95 12 +85 96 14 +85 90 18 +85 99 20 +85 88 27 +85 94 29 +85 93,98 33 +85 87,92 40 
+90 11 +86 98 12 +86 92 22 +86 99 29 +86 97 33 +86 95 50 +86 89 100 +91,94-95 10 +87 96 11 +87 93,98-99 17 +87 92 33 +87 90 36 +87 88 78 +93 8 +88 94 10 +88 96 11 +88 98 17 +88 92 23 +88 90 25 +88 99 27 +90 11 +89 98 12 +89 92 22 +89 99 29 +89 97 33 +89 95 50 +93,99 8 +90 95 11 +90 91-92 25 +92 10 +91 99 12 +91 94-95 20 +91 93 29 +94 10 +92 96 11 +92 97 12 +92 95 22 +92 98-99 27 +92 93 40 +98-99 9 +93 95 12 +93 96 14 +93 94 29 +98-99 29 +98-99 12 +95 97 33 +98 14 +98-99 17 +99 33 diff --git a/tutorial/examples/bankssim/tf/0.2/sim.tf b/tutorial/examples/bankssim/tf/0.2/sim.tf new file mode 100644 index 0000000..b934f28 --- /dev/null +++ b/tutorial/examples/bankssim/tf/0.2/sim.tf @@ -0,0 +1,1102 @@ +@edge +@edgeValues +@converters=Dirk Roorda +@description=similarity between ayas, as a percentage of the common material wrt the combined material +@name=Banks (similar words) +@sourceUrl=https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/text-fabric/use.ipynb +@valueType=int +@version=0.2 +@writtenBy=Text-Fabric +@dateWritten=2019-04-09T12:06:30Z + +15 7 +1 2,9,12,23,41,43,47,76 8 +1 10-11,49-50,53-54,82,86,89,95 9 +1 8,14,30,32,65 10 +1 87-88 13 +1 5,60 15 +1 21,26,56,78-79 17 +1 31,45,51,58,66-67,75 18 +1 34,39-40,64,71 20 +1 48,92 21 +1 52,57,61,74 22 +1 80,83,90 23 +1 35,63,73,85,93,98 25 +1 37,55,72,84 27 +1 24,27,29,33,36 30 +1 16 31 +1 18,22,25,38,42,46,59,62,69,91,94 33 +1 44,99 36 +1 81 42 +1 20,70 50 +1 4,7 100 +4,7 8 +2 15,90 9 +2 70,99 10 +2 26,55-56,72,79 11 +2 9,12,33,36,41,45,51,58,67,76 12 +2 10-11,22,25,34,42,46,49-50,54,59,62,69,82,86,89,94-95 14 +2 3,6,13,17,19,28,57,68,77,96 17 +2 81,87-88 18 +2 18,38,80,83,97 20 +2 20,35,44,60,63,73,85,98 22 +2 21,37,78,84 25 +2 27,43,47,66 29 +2 16,48 30 +2 40,53,71 33 +2 5,93 38 +2 65 40 +2 92 44 +2 23 50 +16,87-88 11 +3 15,18,38 12 +3 5,63,70,85,93,98-99 14 +3 21,37,55-56,78-79 17 +3 27,43,45,47,51 20 +3 53,81,92 25 +3 83 29 +3 14,28,60,77 33 +3 76 50 +3 6 100 +15 7 +4 9,12,23,41,43,47,76 8 +4 10-11,49-50,53-54,82,86,89,95 9 +4 8,14,30,32,65 10 +4 87-88 13 +4 5,60 15 +4 21,26,56,78-79 17 +4 31,45,51,58,66-67,75 18 +4 34,39-40,64,71 20 +4 48,92 21 +4 52,57,61,74 22 +4 80,83,90 23 +4 35,63,73,85,93,98 25 +4 37,55,72,84 27 +4 24,27,29,33,36 30 +4 16 31 +4 18,22,25,38,42,46,59,62,69,91,94 33 +4 44,99 36 +4 81 42 +4 20,70 50 +4 7 100 +87,90 8 +5 70,85,98 9 +5 21,37,55-56,79 10 +5 23-24,29,33,41,43,47,66,76 11 +5 64,71 12 +5 6,13,17,19,28,52,61,65,68,74,77,96 14 +5 7 15 +5 16,88,97 17 +5 15 18 +5 20,44,63,93 20 +5 26,72,78 22 +5 9,12,27,31,36,45,51,67 25 +5 48,81 27 +5 11,34,40,50,54,82,95 29 +5 18,38,83 30 +5 35,60,73,99 33 +5 84 38 +5 92 40 +5 80 44 +5 10,49,53,86,89 50 +16,87-88 11 +6 15,18,38 12 +6 63,70,85,93,98-99 14 +6 21,37,55-56,78-79 17 +6 27,43,45,47,51 20 +6 53,81,92 25 +6 83 29 +6 14,28,60,77 33 +6 76 50 +15 7 +7 9,12,23,41,43,47,76 8 +7 10-11,49-50,53-54,82,86,89,95 9 +7 8,14,30,32,65 10 +7 87-88 13 +7 60 15 +7 21,26,56,78-79 17 +7 31,45,51,58,66-67,75 18 +7 34,39-40,64,71 20 +7 48,92 21 +7 52,57,61,74 22 +7 80,83,90 23 +7 35,63,73,85,93,98 25 +7 37,55,72,84 27 +7 24,27,29,33,36 30 +7 16 31 +7 18,22,25,38,42,46,59,62,69,91,94 33 +7 44,99 36 +7 81 42 +7 20,70 50 +16,48,87-88 11 +8 15,18,38,80,83 12 +8 44,63,73,85,93,98-99 14 +8 37,55-56,72,79 17 +8 9,12,24,33,36,41,45,51,58,66-67,75 20 +8 22,25,39,42,46,54,59,62,64,69,82,94 25 +8 29 50 +8 30,32 100 +81,87-88 9 +9 15,18,38,83,90 10 +9 44,60,70,85,99 11 +9 26,37,78,84 12 +9 23-24,29,31,36,41,43,47,66 14 +9 10-11,34,39,49-50,53,86,89 17 +9 
[output abridged: a long listing of word-node pairs with their `sim` similarity percentages, as computed for the Banks corpus in tutorial/app.ipynb]
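The abridged listing above is plain notebook output. As a rough sketch — not part of this diff — of how such `sim` pairs could be queried with the Text-Fabric API; the `use` spec and the `mod` path below are assumptions based on the data locations mentioned elsewhere in this tutorial:

```python
# Hypothetical sketch: list similarity pairs via the `sim` edge feature.
# The app spec and module path are assumptions, not taken from the diff.
from tf.app import use

A = use("annotation/banks", mod="annotation/banks/sim/tf")
F = A.api.F  # node features
E = A.api.E  # edge features

for w in F.otype.s("word"):
    pairs = E.sim.f(w)  # outgoing sim edges from w: (other word node, percentage)
    if pairs:
        print(w, pairs)
```

`E.sim.b(w)` would give the pairs in both directions instead of only the outgoing ones.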
diff --git a/tutorial/images/dans.png b/tutorial/images/dans.png new file mode 100755 index 0000000..0c0869a Binary files /dev/null and b/tutorial/images/dans.png differ diff --git a/tutorial/images/phblogo.png b/tutorial/images/phblogo.png new file mode 100644 index 0000000..2789817 Binary files /dev/null and b/tutorial/images/phblogo.png differ diff --git a/tutorial/images/tf-small.png b/tutorial/images/tf-small.png new file mode 100644 index 0000000..57fc227 Binary files /dev/null and b/tutorial/images/tf-small.png differ diff --git a/tutorial/images/tf.png b/tutorial/images/tf.png new file mode 100644 index 0000000..6f5e6f8 Binary files /dev/null and b/tutorial/images/tf.png differ diff --git a/tutorial/repo.ipynb b/tutorial/repo.ipynb new file mode 100644 index 0000000..4a2a7b1 --- /dev/null +++ b/tutorial/repo.ipynb @@ -0,0 +1,1647 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "---\n", + "Start with [convert](https://nbviewer.jupyter.org/github/annotation/banks/blob/master/programs/convert.ipynb)\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Getting data from online repos\n", + "\n", + "We show the various automatic ways by which you can get data that is out there on GitHub to your computer.\n", + "\n", + "The workhorse is the function `checkoutRepo()` in `tf.advanced.repo`.\n", + "\n", + "Text-Fabric uses this function for all operations where data flows from GitHub to your computer.\n", + "\n", + "There are quite a few options, and here we explain all the `checkout` options, i.e. the selection of\n", + "data from the history.\n", + "\n", + "See also the [documentation](https://annotation.github.io/text-fabric/tf/advanced/repo.html)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Leading example\n", + "\n", + "We use markdown display from IPython purely for presentation.\n", + "It is not needed for running `checkoutRepo()`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from tf.advanced.helpers import dm\n", + "from tf.advanced.repo import checkoutRepo" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We work with our tiny example TF app: `banks`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "ORG = \"annotation\"\n", + "REPO = \"banks\"\n", + "MAIN = \"tf\"\n", + "MOD = \"sim/tf\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`MAIN` points to the main data, `MOD` points to a module of data: the similarity feature."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "## Presenting the results\n", + "\n", + "The function `do()` just formats the results of a `checkoutRepo()` run.\n", + "\n", + "The result of such a run, after the progress messages, is a tuple.\n", + "For the explanation of the tuple, read the [docs](https://annotation.github.io/text-fabric/tf/advanced/repo.html)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "def do(task):\n", + " md = f\"\"\"\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`{task[0]}` | `{task[1]}` | `{task[2]}` | `{task[3]}` | `{task[4]}`\n", + "\"\"\"\n", + " dm(md)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "incorrectly_encoded_metadata": "toc-hr-collapsed=false" + }, + "source": [ + "## All the checkout options\n", + "\n", + "We discuss the meaning and effects of the values you can pass to the `checkout` option." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `clone`\n", + "\n", + "> Look whether the appropriate folder exists under your `~/github` directory.\n", + "\n", + "This is merely a check whether your data exists in the expected location.\n", + "\n", + "* No online checks take place.\n", + "* No data is moved or copied.\n", + "\n", + "**NB**: you cannot select releases and commits in your *local* GitHub clone.\n", + "The data will be used as it is found on your file system.\n", + "\n", + "**When to use**\n", + "\n", + "> If you are developing new feature data.\n", + "\n", + "When you develop your data in a repository, your development is private as long as you\n", + "do not push to GitHub.\n", + "\n", + "You can test your data, even without locally committing your data.\n", + "\n", + "But, if you are ready to share your data, everything is in place, and you only\n", + "have to commit and push, and pass the location on github to others, like\n", + "\n", + "```\n", + "myorg/myrepo/subfolder\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "data: ~/github/annotation/banks/tf/0.2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`None` | `None` | `clone` | `~/github` | `annotation/banks/tf`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version=\"0.2\", checkout=\"clone\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We show what happens if you do not have a local github clone in `~/github`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "%%sh\n", + "\n", + "mv ~/github/annotation/banks/tf ~/github/annotation/banks/tfxxx" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The requested data is not available offline\n" + ] + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`None` | `None` | `False` | `False` | `None`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version=\"0.2\", checkout=\"clone\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that no attempt is made to retrieve online data." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "%%sh\n", + "\n", + "mv ~/github/annotation/banks/tfxxx ~/github/annotation/banks/tf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `local`\n", + "\n", + "> Look whether the appropriate folder exists under your `~/text-fabric-data` directory.\n", + "\n", + "This is merely a check whether your data exists in the expected location.\n", + "\n", + "* No online checks take place.\n", + "* No data is moved or copied.\n", + "\n", + "**When to use**\n", + "\n", + "> If you are using data created and shared by others, and if the data\n", + "is already on your system.\n", + "\n", + "You can be sure that no updates are downloaded, and that everything works the same as the last time\n", + "you ran your program.\n", + "\n", + "If you do not already have the data, you have to pass `latest` or `hot` or `''` which will be discussed below." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "data: ~/text-fabric-data/annotation/banks/tf/0.2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`9713e71c18fd296cf1860d6411312f9127710ba7` | `v2.0` | `local` | `~/text-fabric-data` | `annotation/banks/tf`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version=\"0.2\", checkout=\"local\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You see this data because earlier I have downloaded release `v2.0`, which is a tag for\n", + "the commit with hash `9713e71c18fd296cf1860d6411312f9127710ba7`." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you do not have any corresponding data in your `~/text-fabric-data`, you get this:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "%%sh\n", + "\n", + "mv ~/text-fabric-data/annotation/banks/tf ~/text-fabric-data/annotation/banks/tfxxx" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The requested data is not available offline\n" + ] + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`None` | `None` | `False` | `False` | `None`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version=\"0.2\", checkout=\"local\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "%%sh\n", + "\n", + "mv ~/text-fabric-data/annotation/banks/tfxxx ~/text-fabric-data/annotation/banks/tf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `''` (default)\n", + "\n", + "This is about when you omit the `checkout` parameter, or pass `''` to it.\n", + "\n", + "The destination for local data is your `~/text-fabric-data` folder.\n", + "\n", + "If you have already a local copy of the data, that will be used.\n", + "\n", + "If not:\n", + "\n", + "> Note that if your local data is outdated, no new data will be downloaded.\n", + "You need `latest` or `hot` for that.\n", + "\n", + "But what is the latest online copy? In this case we mean:\n", + "\n", + "* the latest *release*, and from that release an appropriate attached zip file\n", + "* but if there is no such zip file, we take the files from the corresponding commit\n", + "* but if there is no release at all, we take the files from the *latest commit*.\n", + "\n", + "**When to use**\n", + "\n", + "> If you need data created/shared by other people and you want to be sure that you always have the\n", + "same copy that you initially downloaded.\n", + "\n", + "* If the data provider makes releases after important modifications, you will get those.\n", + "* If the data provider is experimenting after the latest release, and commits them to GitHub,\n", + " you do not get those.\n", + "\n", + "However, with `hot`, you `can` get the latest commit, to be discussed below." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "data: ~/text-fabric-data/annotation/banks/tf/0.2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`9713e71c18fd296cf1860d6411312f9127710ba7` | `v2.0` | `local` | `~/text-fabric-data` | `annotation/banks/tf`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version=\"0.2\", checkout=\"\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that no data has been downloaded, because it has detected that there is already local data on your computer." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you do not have any checkout of this data on your computer, the data will be downloaded." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "%%sh\n", + "\n", + "rm -rf ~/text-fabric-data/annotation/banks/tf" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The requested data is not available offline\n", + "rate limit is 5000 requests per hour, with 4994 left for this hour\n", + "\tconnecting to online GitHub repo annotation/banks ... connected\n", + "\tdownloading https://github.com/annotation/banks/releases/download/v2.0/tf-0.2.zip ... \n", + "\tunzipping ... \n", + "\tsaving data\n" + ] + }, + { + "data": { + "text/html": [ + "data: ~/text-fabric-data/annotation/banks/tf/0.2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`9713e71c18fd296cf1860d6411312f9127710ba7` | `v2.0` | `None` | `~/text-fabric-data` | `annotation/banks/tf`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version=\"0.2\", checkout=\"\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Note about versions and releases\n", + "\n", + "The **version** of the data is not necessarily the same concept as the **release** of it.\n", + "\n", + "It is possible to keep the versions and the releases strictly parallel,\n", + "but in text conversion workflows it can be handy to make a distinction between them,\n", + "e.g. as follows:\n", + "\n", + "> the version is a property of the input data\n", + "> the release is a property of the output data\n", + "\n", + "When you create data from sources using conversion algorithms,\n", + "you want to increase the version if you get new input data, e.g. as a result of corrections\n", + "made by the author.\n", + "\n", + "But if you modify your conversion algorithm, while still running it on the same input data,\n", + "you may release the new output data as a **new release** of the **same version**.\n", + "\n", + "Likewise, when the input data stays the same, but you have corrected typos in the metadata,\n", + "you can make a **new release** of the **same version** of the data.\n", + "\n", + "The conversion delivers the features under a specific version,\n", + "and Text-Fabric supports those versions: users of TF can select the version they work with.\n", + "\n", + "Releases are made in the version control system (git and GitHub).\n", + "The part of Text-Fabric that auto-downloads data is aware of releases.\n", + "But once the data has been downloaded in place, there is no machinery in Text-Fabric to handle\n", + "different releases.\n", + "\n", + "Yet the release tag and commit hash are passed on to the point where it comes to recording\n", + "the provenance of the data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Download a different version\n", + "\n", + "We download version `0.1` of the data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The requested data is not available offline\n", + "rate limit is 5000 requests per hour, with 4985 left for this hour\n", + "\tconnecting to online GitHub repo annotation/banks ... connected\n", + "\ttf/0.1/author.tf...downloaded\n", + "\ttf/0.1/gap.tf...downloaded\n", + "\ttf/0.1/letters.tf...downloaded\n", + "\ttf/0.1/number.tf...downloaded\n", + "\ttf/0.1/oslots.tf...downloaded\n", + "\ttf/0.1/otext.tf...downloaded\n", + "\ttf/0.1/otype.tf...downloaded\n", + "\ttf/0.1/punc.tf...downloaded\n", + "\ttf/0.1/terminator.tf...downloaded\n", + "\ttf/0.1/title.tf...downloaded\n", + "\tOK\n" + ] + }, + { + "data": { + "text/html": [ + "data: ~/text-fabric-data/annotation/banks/tf/0.1" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`9713e71c18fd296cf1860d6411312f9127710ba7` | `v2.0` | `None` | `~/text-fabric-data` | `annotation/banks/tf`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version=\"0.1\", checkout=\"\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Several observations:\n", + "\n", + "* we obtained the older version from the *latest* release, which is still release `v2.0`\n", + "* the download looks different from when we downloaded version `0.2`;\n", + " this is because the data producer has zipped the `0.2` data and has attached it to release `v2.0`,\n", + " but he forgot, or deliberately refused, to attach version `0.1` to that release;\n", + " so it has been retrieved directly from the files in the corresponding commit, which is\n", + " `9713e71c18fd296cf1860d6411312f9127710ba7`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the verification, an online check is needed. 
The verification consists of checking the release tag and/or commit hash.\n", + "\n", + "If there is no online connection, you get this:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "%%sh\n", + "\n", + "networksetup -setairportpower en0 off" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "no internet\n", + "The offline data may not be the latest\n" + ] + }, + { + "data": { + "text/html": [ + "data: ~/text-fabric-data/annotation/banks/tf/0.1" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`9713e71c18fd296cf1860d6411312f9127710ba7` | `v2.0` | `None` | `~/text-fabric-data` | `annotation/banks/tf`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version=\"0.1\", checkout=\"latest\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "or if you do not have local data:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "%%sh\n", + "\n", + "mv ~/text-fabric-data/annotation/banks/tf/0.1 ~/text-fabric-data/annotation/banks/tf/0.1xxx" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "no internet\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The requested data is not available offline\n" + ] + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`None` | `None` | `False` | `False` | `None`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version=\"0.1\", checkout=\"latest\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "%%sh\n", + "\n", + "mv ~/text-fabric-data/annotation/banks/tf/0.1xxx ~/text-fabric-data/annotation/banks/tf/0.1" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "%%sh\n", + "\n", + "networksetup -setairportpower en0 on" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `latest`\n", + "\n", + "> The latest online release will be identified,\n", + "and if you do not have that copy locally, it will be downloaded.\n", + "\n", + "**When to use**\n", + "\n", + "> If you need data created/shared by other people and you want to be sure that you always have the\n", + "latest *stable* version of that data, unreleased data is not good enough.\n", + "\n", + "One of the difference with `checkout=''` is that if there are no releases, you will not get data." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rate limit is 5000 requests per hour, with 4963 left for this hour\n", + "\tconnecting to online GitHub repo annotation/banks ... 
connected\n" + ] + }, + { + "data": { + "text/html": [ + "data: ~/text-fabric-data/annotation/banks/tf/0.2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`9713e71c18fd296cf1860d6411312f9127710ba7` | `v2.0` | `None` | `~/text-fabric-data` | `annotation/banks/tf`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version=\"0.2\", checkout=\"latest\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is no sim/tf data in any release commit, so if we look it up, it should fail." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rate limit is 5000 requests per hour, with 4960 left for this hour\n", + "\tconnecting to online GitHub repo annotation/banks ... connected\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No directory sim/tf/0.2 in #9713e71c18fd296cf1860d6411312f9127710ba7\tFailed" + ] + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`None` | `None` | `False` | `False` | `None`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MOD, version=\"0.2\", checkout=\"latest\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But with `checkout=''` it will only be found if you do not have local data already:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "data: ~/text-fabric-data/annotation/banks/sim/tf/0.2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`8d87675fd02ee96ad6f4c3a5ce99e0bda8277a54` | `None` | `local` | `~/text-fabric-data` | `annotation/banks/sim/tf`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MOD, version=\"0.2\", checkout=\"\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In that case there is only one way: `hot`:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rate limit is 5000 requests per hour, with 4950 left for this hour\n", + "\tconnecting to online GitHub repo annotation/banks ... 
connected\n" + ] + }, + { + "data": { + "text/html": [ + "data: ~/text-fabric-data/annotation/banks/sim/tf/0.2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`8d87675fd02ee96ad6f4c3a5ce99e0bda8277a54` | `None` | `None` | `~/text-fabric-data` | `annotation/banks/sim/tf`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MOD, version=\"0.2\", checkout=\"hot\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `hot`\n", + "\n", + "> The latest online commit will be identified,\n", + "and if you do not have that copy locally, it will be downloaded.\n", + "\n", + "**When to use**\n", + "\n", + "> If you need data created/shared by other people and you want to be sure that you always have the\n", + "latest version of that data, whether released or not.\n", + "\n", + "The difference with `checkout=''` is that if there are releases,\n", + "you will now get data that may be newer than the latest release." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rate limit is 5000 requests per hour, with 4947 left for this hour\n", + "\tconnecting to online GitHub repo annotation/banks ... connected\n", + "\ttf/0.2/author.tf...downloaded\n", + "\ttf/0.2/gap.tf...downloaded\n", + "\ttf/0.2/letters.tf...downloaded\n", + "\ttf/0.2/number.tf...downloaded\n", + "\ttf/0.2/oslots.tf...downloaded\n", + "\ttf/0.2/otext.tf...downloaded\n", + "\ttf/0.2/otype.tf...downloaded\n", + "\ttf/0.2/punc.tf...downloaded\n", + "\ttf/0.2/terminator.tf...downloaded\n", + "\ttf/0.2/title.tf...downloaded\n", + "\tOK\n" + ] + }, + { + "data": { + "text/html": [ + "data: ~/text-fabric-data/annotation/banks/tf/0.2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`8d87675fd02ee96ad6f4c3a5ce99e0bda8277a54` | `None` | `None` | `~/text-fabric-data` | `annotation/banks/tf`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version=\"0.2\", checkout=\"hot\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Observe that data has been downloaded, and that we have now data corresponding to a different commit hash,\n", + "and not corresponding to a release.\n", + "\n", + "If we now ask for the latest *stable* data, the data will be downloaded anew." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rate limit is 5000 requests per hour, with 4931 left for this hour\n", + "\tconnecting to online GitHub repo annotation/banks ... connected\n", + "\tdownloading https://github.com/annotation/banks/releases/download/v2.0/tf-0.2.zip ... \n", + "\tunzipping ... 
\n", + "\tsaving data\n" + ] + }, + { + "data": { + "text/html": [ + "data: ~/text-fabric-data/annotation/banks/tf/0.2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`9713e71c18fd296cf1860d6411312f9127710ba7` | `v2.0` | `None` | `~/text-fabric-data` | `annotation/banks/tf`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version=\"0.2\", checkout=\"latest\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `v1.0` a specific release\n", + "\n", + "> Look for a specific online release to get data from.\n", + "\n", + "**When to use**\n", + "\n", + "> When you want to replicate something, and need data from an earlier point in the history." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rate limit is 5000 requests per hour, with 4924 left for this hour\n", + "\tconnecting to online GitHub repo annotation/banks ... connected\n", + "\tdownloading https://github.com/annotation/banks/releases/download/v1.0/tf-0.1.zip ... \n", + "\tunzipping ... \n", + "\tsaving data\n" + ] + }, + { + "data": { + "text/html": [ + "data: ~/text-fabric-data/annotation/banks/tf/0.1" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`5b7dca212dd456e705f4c2cb1aa0f895ab5b2fc9` | `v1.0` | `None` | `~/text-fabric-data` | `annotation/banks/tf`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version=\"0.1\", checkout=\"v1.0\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We might try to get version `0.2` from this release." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rate limit is 5000 requests per hour, with 4917 left for this hour\n", + "\tconnecting to online GitHub repo annotation/banks ... connected\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No directory tf/0.2 in #5b7dca212dd456e705f4c2cb1aa0f895ab5b2fc9\tFailed" + ] + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`None` | `None` | `False` | `False` | `None`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version=\"0.2\", checkout=\"v1.0\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At that early point in the history there is not yet a version `0.2` of the data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `a81746c` a specific commit\n", + "\n", + "> Look for a specific online commit to get data from.\n", + "\n", + "**When to use**\n", + "\n", + "> When you want to replicate something, and need data from an earlier point in the history, and there is no\n", + "release for that commit." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rate limit is 5000 requests per hour, with 4907 left for this hour\n", + "\tconnecting to online GitHub repo annotation/banks ... connected\n", + "\ttf/0.1/author.tf...downloaded\n", + "\ttf/0.1/gap.tf...downloaded\n", + "\ttf/0.1/letters.tf...downloaded\n", + "\ttf/0.1/number.tf...downloaded\n", + "\ttf/0.1/oslots.tf...downloaded\n", + "\ttf/0.1/otext.tf...downloaded\n", + "\ttf/0.1/otype.tf...downloaded\n", + "\ttf/0.1/punc.tf...downloaded\n", + "\ttf/0.1/terminator.tf...downloaded\n", + "\ttf/0.1/title.tf...downloaded\n", + "\tOK\n" + ] + }, + { + "data": { + "text/html": [ + "data: ~/text-fabric-data/annotation/banks/tf/0.1" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`a81746c5f9627637db4dae04c2d5348bda9e511a` | `None` | `None` | `~/text-fabric-data` | `annotation/banks/tf`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(\n", + " checkoutRepo(\n", + " org=ORG,\n", + " repo=REPO,\n", + " folder=MAIN,\n", + " version=\"0.1\",\n", + " checkout=\"a81746c5f9627637db4dae04c2d5348bda9e511a\",\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "incorrectly_encoded_metadata": "toc-hr-collapsed=false" + }, + "source": [ + "## *source* and *dest*: an alternative for `~/github` and `~/text-fabric-data`\n", + "\n", + "Everything so far uses the hard-wired `~/github` and `~/text-fabric-data` directories.\n", + "But you can change that:\n", + "\n", + "* pass *source* as a replacement for `~/github`.\n", + "* pass *dest* as a replacement for `~/text-fabric-data`.\n", + "\n", + "**When to use**\n", + "\n", + "> if you do not want to interfere with the `~/text-fabric-data` directory.\n", + "\n", + "Text-Fabric manages the `~/text-fabric-data` directory,\n", + "and if you are experimenting outside Text-Fabric\n", + "you may not want to touch its data directory.\n", + "\n", + "> if you want to clone data into your `~/github` directory.\n", + "\n", + "Normally, TF uses your `~/github` directory as a source of information,\n", + "and never writes into it.\n", + "But if you explicitly pass `dest=~/github`, things change: downloads will\n", + "arrive under `~/github`. Use this with care.\n", + "\n", + "> if you work with cloned data outside your `~/github` directory,\n", + "\n", + "you can let the system look in *source* instead of `~/github`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We customize source and destination directories:\n", + "\n", + "* we put them both under `~/Downloads`\n", + "* we give them different names" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "MY_GH = \"~/Downloads/repoclones\"\n", + "MY_TFD = \"~/Downloads/textbase\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download a fresh copy of the data to `~/Downloads/textbase` instead." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The requested data is not available offline\n", + "rate limit is 5000 requests per hour, with 4891 left for this hour\n", + "\tconnecting to online GitHub repo annotation/banks ... connected\n", + "\tdownloading https://github.com/annotation/banks/releases/download/v2.0/tf-0.2.zip ... \n", + "\tunzipping ... \n", + "\tsaving data\n" + ] + }, + { + "data": { + "text/html": [ + "data: ~/Downloads/textbase/annotation/banks/tf/0.2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`9713e71c18fd296cf1860d6411312f9127710ba7` | `v2.0` | `None` | `~/Downloads/textbase` | `annotation/banks/tf`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(\n", + " checkoutRepo(\n", + " org=ORG,\n", + " repo=REPO,\n", + " folder=MAIN,\n", + " version=\"0.2\",\n", + " checkout=\"\",\n", + " source=MY_GH,\n", + " dest=MY_TFD,\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lookup the same data locally." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "data: ~/Downloads/textbase/annotation/banks/tf/0.2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`9713e71c18fd296cf1860d6411312f9127710ba7` | `v2.0` | `local` | `~/Downloads/textbase` | `annotation/banks/tf`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(\n", + " checkoutRepo(\n", + " org=ORG,\n", + " repo=REPO,\n", + " folder=MAIN,\n", + " version=\"0.2\",\n", + " checkout=\"\",\n", + " source=MY_GH,\n", + " dest=MY_TFD,\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We copy the local github data to the custom location:" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "%%sh\n", + "\n", + "mkdir -p ~/Downloads/repoclones/annotation\n", + "cp -R ~/github/annotation/banks ~/Downloads/repoclones/annotation/banks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lookup the data in this alternative directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "data: ~/Downloads/repoclones/annotation/banks/tf/0.2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "\n", + "commit | release | local | base | subdir\n", + "--- | --- | --- | --- | ---\n", + "`None` | `None` | `clone` | `~/Downloads/repoclones` | `annotation/banks/tf`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "do(\n", + " checkoutRepo(\n", + " org=ORG,\n", + " repo=REPO,\n", + " folder=MAIN,\n", + " version=\"0.2\",\n", + " checkout=\"clone\",\n", + " source=MY_GH,\n", + " dest=MY_TFD,\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the directory trees under the customised *source* and *dest* locations have exactly the same shape as before." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "With the help of `checkoutRepo()` you will be able to make local copies of online data in an organized way.\n", + "\n", + "This will help you when\n", + "\n", + "* you use other people's data\n", + "* develop your own data\n", + "* share and publish your data\n", + "* go back in history." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "All chapters:\n", + "\n", + "* [use](use.ipynb)\n", + "* [share](share.ipynb)\n", + "* [app](app.ipynb)\n", + "* *repo*\n", + "* [compose](compose.ipynb)\n", + "\n", + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python3.9", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + }, + "toc-autonumbering": true + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorial/repo.py b/tutorial/repo.py new file mode 100644 index 0000000..92cf39c --- /dev/null +++ b/tutorial/repo.py @@ -0,0 +1,476 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.11.4 +# kernelspec: +# display_name: Python3.9 +# language: python +# name: python3 +# --- + +# +# +# +# +# --- +# Start with [convert](https://nbviewer.jupyter.org/github/annotation/banks/blob/master/programs/convert.ipynb) +# +# --- + +# # Getting data from online repos +# +# We show the various automatic ways by which you can get data that is out there on GitHub to your computer. +# +# The work horse is the function `checkoutRepo()` in `tf.applib.repo`. +# +# Text-Fabric uses this function for all operations where data flows from GitHub to your computer. +# +# There are quite some options, and here we explain all the `checkout` options, i.e. the selection of +# data from the history. +# +# See also the [documentation](https://annotation.github.io/text-fabric/tf/advanced/repo.html). + +# %load_ext autoreload +# %autoreload 2 + +# ## Leading example +# +# We use markdown display from IPython purely for presentation. +# It is not needed to run `checkoutRepo()`. + +from tf.advanced.helpers import dm +from tf.advanced.repo import checkoutRepo + +# We work with our tiny example TF app: `banks`. 
+ +ORG = "annotation" +REPO = "banks" +MAIN = "tf" +MOD = "sim/tf" + + +# `MAIN`points to the main data, `MOD` points to a module of data: the similarity feature. + +# ## Presenting the results +# +# The function `do()` just formats the results of a `checkoutRepo()` run. +# +# The result of such a run, after the progress messages, is a tuple. +# For the explanation of the tuple, read the [docs](https://annotation.github.io/text-fabric/tf/advanced/repo.html). + + +def do(task): + md = f""" +commit | release | local | base | subdir +--- | --- | --- | --- | --- +`{task[0]}` | `{task[1]}` | `{task[2]}` | `{task[3]}` | `{task[4]}` +""" + dm(md) + + +# + [markdown] toc-hr-collapsed=false +# ## All the checkout options +# +# We discuss the meaning and effects of the values you can pass to the `checkout` option. +# - + +# ### `clone` +# +# > Look whether the appropriate folder exists under your `~/github` directory. +# +# This is merely a check whether your data exists in the expected location. +# +# * No online checks take place. +# * No data is moved or copied. +# +# **NB**: you cannot select releases and commits in your *local* GitHub clone. +# The data will be used as it is found on your file system. +# +# **When to use** +# +# > If you are developing new feature data. +# +# When you develop your data in a repository, your development is private as long as you +# do not push to GitHub. +# +# You can test your data, even without locally committing your data. +# +# But, if you are ready to share your data, everything is in place, and you only +# have to commit and push, and pass the location on github to others, like +# +# ``` +# myorg/myrepo/subfolder +# ``` + +do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.2", checkout="clone")) + +# We show what happens if you do not have a local github clone in `~/github`. + +# + language="sh" +# +# mv ~/github/annotation/banks/tf ~/github/annotation/banks/tfxxx +# - + +do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.2", checkout="clone")) + +# Note that no attempt is made to retrieve online data. + +# + language="sh" +# +# mv ~/github/annotation/banks/tfxxx ~/github/annotation/banks/tf +# - + +# ### `local` +# +# > Look whether the appropriate folder exists under your `~/text-fabric-data` directory. +# +# This is merely a check whether your data exists in the expected location. +# +# * No online checks take place. +# * No data is moved or copied. +# +# **When to use** +# +# > If you are using data created and shared by others, and if the data +# is already on your system. +# +# You can be sure that no updates are downloaded, and that everything works the same as the last time +# you ran your program. +# +# If you do not already have the data, you have to pass `latest` or `hot` or `''` which will be discussed below. + +do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.2", checkout="local")) + +# You see this data because earlier I have downloaded release `v2.0`, which is a tag for +# the commit with hash `9713e71c18fd296cf1860d6411312f9127710ba7`. 
+ +# If you do not have any corresponding data in your `~/text-fabric-data`, you get this: + +# + language="sh" +# +# mv ~/text-fabric-data/annotation/banks/tf ~/text-fabric-data/annotation/banks/tfxxx +# - + +do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.2", checkout="local")) + +# + language="sh" +# +# mv ~/text-fabric-data/annotation/banks/tfxxx ~/text-fabric-data/annotation/banks/tf +# - + +# ### `''` (default) +# +# This is about when you omit the `checkout` parameter, or pass `''` to it. +# +# The destination for local data is your `~/text-fabric-data` folder. +# +# If you have already a local copy of the data, that will be used. +# +# If not: +# +# > Note that if your local data is outdated, no new data will be downloaded. +# You need `latest` or `hot` for that. +# +# But what is the latest online copy? In this case we mean: +# +# * the latest *release*, and from that release an appropriate attached zip file +# * but if there is no such zip file, we take the files from the corresponding commit +# * but if there is no release at all, we take the files from the *latest commit*. +# +# **When to use** +# +# > If you need data created/shared by other people and you want to be sure that you always have the +# same copy that you initially downloaded. +# +# * If the data provider makes releases after important modifications, you will get those. +# * If the data provider is experimenting after the latest release, and commits them to GitHub, +# you do not get those. +# +# However, with `hot`, you `can` get the latest commit, to be discussed below. + +do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.2", checkout="")) + +# Note that no data has been downloaded, because it has detected that there is already local data on your computer. + +# If you do not have any checkout of this data on your computer, the data will be downloaded. + +# + language="sh" +# +# rm -rf ~/text-fabric-data/annotation/banks/tf +# - + +do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.2", checkout="")) + +# #### Note about versions and releases +# +# The **version** of the data is not necessarily the same concept as the **release** of it. +# +# It is possible to keep the versions and the releases strictly parallel, +# but in text conversion workflows it can be handy to make a distinction between them, +# e.g. as follows: +# +# > the version is a property of the input data +# > the release is a property of the output data +# +# When you create data from sources using conversion algorithms, +# you want to increase the version if you get new input data, e.g. as a result of corrections +# made by the author. +# +# But if you modify your conversion algorithm, while still running it on the same input data, +# you may release the new output data as a **new release** of the **same version**. +# +# Likewise, when the input data stays the same, but you have corrected typos in the metadata, +# you can make a **new release** of the **same version** of the data. +# +# The conversion delivers the features under a specific version, +# and Text-Fabric supports those versions: users of TF can select the version they work with. +# +# Releases are made in the version control system (git and GitHub). +# The part of Text-Fabric that auto-downloads data is aware of releases. +# But once the data has been downloaded in place, there is no machinery in Text-Fabric to handle +# different releases. 
+# +# Yet the release tag and commit hash are passed on to the point where it comes to recording +# the provenance of the data. + +# #### Download a different version +# +# We download version `0.1` of the data. + +do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.1", checkout="")) + +# Several observations: +# +# * we obtained the older version from the *latest* release, which is still release `v2.0` +# * the download looks different from when we downloaded version `0.2`; +# this is because the data producer has zipped the `0.2` data and has attached it to release `v2.0`, +# but he forgot, or deliberately refused, to attach version `0.1` to that release; +# so it has been retrieved directly from the files in the corresponding commit, which is +# `9713e71c18fd296cf1860d6411312f9127710ba7`. + +# For the verification, an online check is needed. The verification consists of checking the release tag and/or commit hash. +# +# If there is no online connection, you get this: + +# + language="sh" +# +# networksetup -setairportpower en0 off +# - + +do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.1", checkout="latest")) + +# or if you do not have local data: + +# + language="sh" +# +# mv ~/text-fabric-data/annotation/banks/tf/0.1 ~/text-fabric-data/annotation/banks/tf/0.1xxx +# - + +do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.1", checkout="latest")) + +# + language="sh" +# +# mv ~/text-fabric-data/annotation/banks/tf/0.1xxx ~/text-fabric-data/annotation/banks/tf/0.1 + +# + language="sh" +# +# networksetup -setairportpower en0 on +# - + +# ### `latest` +# +# > The latest online release will be identified, +# and if you do not have that copy locally, it will be downloaded. +# +# **When to use** +# +# > If you need data created/shared by other people and you want to be sure that you always have the +# latest *stable* version of that data, unreleased data is not good enough. +# +# One of the difference with `checkout=''` is that if there are no releases, you will not get data. + +do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.2", checkout="latest")) + +# There is no sim/tf data in any release commit, so if we look it up, it should fail. + +do(checkoutRepo(org=ORG, repo=REPO, folder=MOD, version="0.2", checkout="latest")) + +# But with `checkout=''` it will only be found if you do not have local data already: + +do(checkoutRepo(org=ORG, repo=REPO, folder=MOD, version="0.2", checkout="")) + +# In that case there is only one way: `hot`: + +do(checkoutRepo(org=ORG, repo=REPO, folder=MOD, version="0.2", checkout="hot")) + +# ### `hot` +# +# > The latest online commit will be identified, +# and if you do not have that copy locally, it will be downloaded. +# +# **When to use** +# +# > If you need data created/shared by other people and you want to be sure that you always have the +# latest version of that data, whether released or not. +# +# The difference with `checkout=''` is that if there are releases, +# you will now get data that may be newer than the latest release. + +do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.2", checkout="hot")) + +# Observe that data has been downloaded, and that we have now data corresponding to a different commit hash, +# and not corresponding to a release. +# +# If we now ask for the latest *stable* data, the data will be downloaded anew. 
+
+do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.2", checkout="latest"))
+
+# ### `v1.0`: a specific release
+#
+# > Look for a specific online release to get data from.
+#
+# **When to use**
+#
+# > When you want to replicate something, and need data from an earlier point in the history.
+
+do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.1", checkout="v1.0"))
+
+# We might try to get version `0.2` from this release.
+
+do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.2", checkout="v1.0"))
+
+# At that early point in the history there is not yet a version `0.2` of the data.
+
+# ### `a81746c`: a specific commit
+#
+# > Look for a specific online commit to get data from.
+#
+# **When to use**
+#
+# > When you want to replicate something, and need data from an earlier point in the history, and there is no
+# release for that commit.
+
+do(
+    checkoutRepo(
+        org=ORG,
+        repo=REPO,
+        folder=MAIN,
+        version="0.1",
+        checkout="a81746c5f9627637db4dae04c2d5348bda9e511a",
+    )
+)
+
+# + [markdown] toc-hr-collapsed=false
+# ## *source* and *dest*: an alternative for `~/github` and `~/text-fabric-data`
+#
+# Everything so far uses the hard-wired `~/github` and `~/text-fabric-data` directories.
+# But you can change that:
+#
+# * pass *source* as a replacement for `~/github`;
+# * pass *dest* as a replacement for `~/text-fabric-data`.
+#
+# **When to use**
+#
+# > If you do not want to interfere with the `~/text-fabric-data` directory.
+#
+# Text-Fabric manages the `~/text-fabric-data` directory,
+# and if you are experimenting outside Text-Fabric
+# you may not want to touch its data directory.
+#
+# > If you want to clone data into your `~/github` directory.
+#
+# Normally, TF uses your `~/github` directory as a source of information,
+# and never writes into it.
+# But if you explicitly pass `dest=~/github`, things change: downloads will
+# arrive under `~/github`. Use this with care.
+#
+# > If you work with cloned data outside your `~/github` directory.
+#
+# You can let the system look in *source* instead of `~/github`.
+# -
+
+# We customize source and destination directories:
+#
+# * we put them both under `~/Downloads`
+# * we give them different names
+
+MY_GH = "~/Downloads/repoclones"
+MY_TFD = "~/Downloads/textbase"
+
+# Download a fresh copy of the data to `~/Downloads/textbase` instead.
+
+do(
+    checkoutRepo(
+        org=ORG,
+        repo=REPO,
+        folder=MAIN,
+        version="0.2",
+        checkout="",
+        source=MY_GH,
+        dest=MY_TFD,
+    )
+)
+
+# Look up the same data locally.
+
+do(
+    checkoutRepo(
+        org=ORG,
+        repo=REPO,
+        folder=MAIN,
+        version="0.2",
+        checkout="",
+        source=MY_GH,
+        dest=MY_TFD,
+    )
+)
+
+# We copy the local github data to the custom location:
+
+# + language="sh"
+#
+# mkdir -p ~/Downloads/repoclones/annotation
+# cp -R ~/github/annotation/banks ~/Downloads/repoclones/annotation/banks
+# -
+
+# Look up the data in this alternative directory.
+
+do(
+    checkoutRepo(
+        org=ORG,
+        repo=REPO,
+        folder=MAIN,
+        version="0.2",
+        checkout="clone",
+        source=MY_GH,
+        dest=MY_TFD,
+    )
+)
+
+# Note that the directory trees under the customized *source* and *dest* locations have exactly the same shape as before.
+
+# ## Conclusion
+#
+# With the help of `checkoutRepo()` you will be able to make local copies of online data in an organized way.
+#
+# This will help you when you:
+#
+# * use other people's data;
+# * develop your own data;
+# * share and publish your data;
+# * go back in history.
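+
+# To recap, a minimal sketch of the call patterns used in this chapter.
+# It assumes the `ORG`, `REPO`, `MAIN`, `MY_GH` and `MY_TFD` names defined above;
+# adjust the version and checkout values to your own situation.
+
+# use a local clone under ~/github, if you have one
+do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.2", checkout="clone"))
+
+# use the latest stable release, downloading it if needed
+do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.2", checkout="latest"))
+
+# use the latest commit, released or not
+do(checkoutRepo(org=ORG, repo=REPO, folder=MAIN, version="0.2", checkout="hot"))
+
+# the same lookup, but against custom source/dest directories
+do(
+    checkoutRepo(
+        org=ORG,
+        repo=REPO,
+        folder=MAIN,
+        version="0.2",
+        checkout="",
+        source=MY_GH,
+        dest=MY_TFD,
+    )
+)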
+ +# --- +# All chapters: +# +# * [use](use.ipynb) +# * [share](share.ipynb) +# * [app](app.ipynb) +# * *repo* +# * [compose](compose.ipynb) +# +# --- diff --git a/tutorial/share.ipynb b/tutorial/share.ipynb new file mode 100644 index 0000000..9f97d86 --- /dev/null +++ b/tutorial/share.ipynb @@ -0,0 +1,100 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "---\n", + "Start with [convert](https://nbviewer.jupyter.org/github/annotation/banks/blob/master/programs/convert.ipynb)\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Share the Banks example corpus" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "Create release data for annotation/banks/tf\n", + "Found 2 versions\n", + "zip files end up in /Users/dirk/Downloads/annotation-release/banks\n", + "zipping annotation/banks 0.1 with 10 features ==> tf-0.1.zip\n", + "zipping annotation/banks 0.2 with 10 features ==> tf-0.2.zip\n" + ] + } + ], + "source": [ + "%%sh\n", + "\n", + "text-fabric-zip annotation/banks/tf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Share the similarity feature as a module\n", + "\n", + "We do not attach the similarity data to a release, since we intend to update this data quite a few times\n", + "after tweaking the parameters of the algorithm by which we create it.\n", + "\n", + "That means that users will get this data from the latest commit.\n", + "\n", + "So we make sure that the similarity data is committed and pushed to GitHub." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "All chapters:\n", + "\n", + "* [use](use.ipynb)\n", + "* *share*\n", + "* [app](app.ipynb)\n", + "* [repo](repo.ipynb)\n", + "* [compose](compose.ipynb)\n", + "\n", + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorial/use.ipynb b/tutorial/use.ipynb new file mode 100644 index 0000000..a2f9cff --- /dev/null +++ b/tutorial/use.ipynb @@ -0,0 +1,790 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "---\n", + "Start with [convert](https://nbviewer.jupyter.org/github/annotation/banks/blob/master/programs/convert.ipynb)\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Use the Banks example corpus" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load TF\n", + "\n", + "We are going to load the new data: all features.\n", + "\n", + "We start a new instance of the TF machinery." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from tf.fabric import Fabric" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is Text-Fabric 8.3.0\n", + "Api reference : https://annotation.github.io/text-fabric/cheatsheet.html\n", + "\n", + "10 features found and 0 ignored\n" + ] + } + ], + "source": [ + "GH_BASE = os.path.expanduser(\"~/github\")\n", + "ORG = \"annotation\"\n", + "REPO = \"banks\"\n", + "FOLDER = \"tf\"\n", + "TF_DIR = f\"{GH_BASE}/{ORG}/{REPO}/{FOLDER}\"\n", + "\n", + "VERSION = \"0.2\"\n", + "\n", + "TF_PATH = f\"{TF_DIR}/{VERSION}\"\n", + "TF = Fabric(locations=TF_PATH)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We ask for a list of all features:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('author',\n", + " 'gap',\n", + " 'letters',\n", + " 'number',\n", + " 'otype',\n", + " 'punc',\n", + " 'terminator',\n", + " 'title',\n", + " 'oslots')" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "allFeatures = TF.explore(silent=True, show=True)\n", + "loadableFeatures = allFeatures[\"nodes\"] + allFeatures[\"edges\"]\n", + "loadableFeatures" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We load all features:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0.00s loading features ...\n", + " 0.02s All features loaded/computed - for details use loadLog()\n" + ] + } + ], + "source": [ + "api = TF.load(loadableFeatures, silent=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You see that all files are marked with a `T`.\n", + "\n", + "That means that Text-Fabric loads the features by reading the plain text `.tf` files.\n", + "But after reading, it makes a binary equivalent and stores it as a `.tfx`\n", + "file in the hidden `.tf` directory next to it.\n", + "\n", + "Furthermore, you see some lines marked with `C`. Here Text-Fabric is computing derived data,\n", + "mostly about sections, the order of nodes, and the relative positions of nodes with respect to the slots they\n", + "are linked to.\n", + "\n", + "The results of this pre-computation are also stored in that hidden `.tf` directory.\n", + "\n", + "The next time, Text-Fabric loads the data from their binary `.tfx` files, which is much faster.\n", + "And the pre-computation step will be skipped.\n", + "\n", + "If the binary files get outdated Text-Fabric will recompile and recompute everything automatically.\n", + "\n", + "So let's load again." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is Text-Fabric 8.3.0\n", + "Api reference : https://annotation.github.io/text-fabric/cheatsheet.html\n", + "\n", + "10 features found and 0 ignored\n", + " 0.00s loading features ...\n", + " 0.01s All features loaded/computed - for details use loadLog()\n" + ] + } + ], + "source": [ + "TF = Fabric(locations=TF_PATH)\n", + "api = TF.load(loadableFeatures, silent=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Where there were `T`s before, there are now `B`s." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hoisting\n", + "\n", + "We can access all TF data programmatically by using `A.api.Features`, or `A.api.F` (same thing) and a bunch of\n", + "other API members.\n", + "\n", + "But if we working with a single data source, we hoist those API members to the global namespace.\n", + "\n", + "Now you can directly refer to `F` and friends." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Computed',\n", + " 'computed-data',\n", + " ('C Computed', 'Call AllComputeds', 'Cs ComputedString')),\n", + " ('Features', 'edge-features', ('E Edge', 'Eall AllEdges', 'Es EdgeString')),\n", + " ('Fabric', 'loading', ('TF',)),\n", + " ('Locality', 'locality', ('L Locality',)),\n", + " ('Nodes', 'navigating-nodes', ('N Nodes',)),\n", + " ('Features',\n", + " 'node-features',\n", + " ('F Feature', 'Fall AllFeatures', 'Fs FeatureString')),\n", + " ('Search', 'search', ('S Search',)),\n", + " ('Text', 'text', ('T Text',))]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "api.makeAvailableIn(globals())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a result, you have an overview of the names you can use." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exploration\n", + "\n", + "Finally, let's explore this set by means of Text-Fabric.\n", + "\n", + "### Frequency list\n", + "\n", + "We can get ordered frequency lists for the values of all features.\n", + "\n", + "First the words:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(('the', 8),\n", + " ('of', 5),\n", + " ('and', 4),\n", + " ('in', 3),\n", + " ('we', 3),\n", + " ('everything', 2),\n", + " ('know', 2),\n", + " ('most', 2),\n", + " ('ones', 2),\n", + " ('patterns', 2),\n", + " ('us', 2),\n", + " ('Besides', 1),\n", + " ('Culture', 1),\n", + " ('Everything', 1),\n", + " ('So', 1),\n", + " ('a', 1),\n", + " ('about', 1),\n", + " ('aid', 1),\n", + " ('any', 1),\n", + " ('around', 1),\n", + " ('as', 1),\n", + " ('barbarian', 1),\n", + " ('bottom', 1),\n", + " ('can', 1),\n", + " ('care', 1),\n", + " ('climbing', 1),\n", + " ('composed', 1),\n", + " ('control', 1),\n", + " ('dead', 1),\n", + " ('elegant', 1),\n", + " ('enjoyable', 1),\n", + " ('final', 1),\n", + " ('find', 1),\n", + " ('free', 1),\n", + " ('games', 1),\n", + " ('good', 1),\n", + " ('harness', 1),\n", + " ('have', 1),\n", + " ('high', 1),\n", + " ('humans', 1),\n", + " ('impossible', 1),\n", + " ('is', 1),\n", + " ('it', 1),\n", + " ('languages', 1),\n", + " ('left', 1),\n", + " ('life', 1),\n", + " ('line', 1),\n", + " ('make', 1),\n", + " ('mattered', 1),\n", + " ('mountains', 1),\n", + " ('not', 1),\n", + " ('nothing', 1),\n", + " ('our', 1),\n", + " ('over', 1),\n", + " ('own', 1),\n", + " ('problems', 1),\n", + " ('really', 1),\n", + " ('romance', 1),\n", + " ('safety', 1),\n", + " ('societies', 1),\n", + " ('sports', 1),\n", + " ('studying', 1),\n", + " ('such', 1),\n", + " ('take', 1),\n", + " ('terms', 1),\n", + " ('that', 1),\n", + " ('that’s', 1),\n", + " ('things', 1),\n", + " ('those', 1),\n", + " ('to', 1),\n", + " ('truth', 1),\n", + " ('ultimately', 1),\n", + " ('where', 1),\n", + " ('why', 1),\n", + " ('without', 1))" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "F.letters.freqList()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the node types we can get info by calling this:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(('book', 99.0, 100, 100),\n", + " ('chapter', 49.5, 101, 102),\n", + " ('sentence', 33.0, 115, 117),\n", + " ('line', 7.666666666666667, 103, 114),\n", + " ('word', 1, 1, 99))" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "C.levels.data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It means that chapters are 49.5 words long on average, and that the chapter nodes are 101 and 102.\n", + "\n", + "And you see that we have 99 words." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Add to the banks corpus\n", + "\n", + "We are going to make a relationship between each pair of words, and we annotate each related pair with how similar they are.\n", + "\n", + "We measure the similarity by looking at the distinct letters in each word (lowercase), and computing the percentage of\n", + "how many letters they have in common with respect to how many letters they jointly have.\n", + "\n", + "This will become a symmetric edge feature. Symmetric means, that if a and b are similar, then b and a as well, with the\n", + "same similarity.\n", + "\n", + "We only store one copy of each symmetric pair of edges.\n", + "\n", + "We can then use\n", + "[`E.sim.b(node)`](https://annotation.github.io/text-fabric/tf/core/edgefeature.html)\n", + "to find all nodes that are parallel to node.\n", + "\n", + "If words do not have letters in common, their similarity is 0, and we do not make an edge." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "# Preparation\n", + "\n", + "We pre-compute all letter sets for all words." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def makeSet(w):\n", + " return set(F.letters.v(w).lower())" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "lines_to_end_of_cell_marker": 2, + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "99 words\n" + ] + } + ], + "source": [ + "words = {}\n", + "\n", + "for w in F.otype.s(\"word\"):\n", + " words[w] = makeSet(w)\n", + "\n", + "nWords = len(words)\n", + "print(f\"{nWords} words\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "def sim(wSet, vSet):\n", + " return int(round(100 * len(wSet & vSet) / len(wSet | vSet)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "# Compute all similarities\n", + "\n", + "We are going to perform all comparisons.\n", + "\n", + "Since there are 99 words, this will amount to only 5000 comparisons.\n", + "\n", + "For a big corpus, this amount will quickly grow with the number of items to be compared.\n", + "\n", + "See for example the similarities in the\n", + "[Quran](https://nbviewer.jupyter.org/github/q-ran/quran/blob/master/programs/parallels.ipynb)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def computeSim():\n", + " similarity = {}\n", + "\n", + " wordNodes = sorted(words.keys())\n", + " nWords = len(wordNodes)\n", + "\n", + " nComparisons = nWords * (nWords - 1) // 2\n", + "\n", + " print(f\"{nComparisons} comparisons to make\")\n", + "\n", + " TF.indent(reset=True)\n", + "\n", + " co = 0\n", + " si = 0\n", + " stop = False\n", + " for i in range(nWords):\n", + " nodeI = wordNodes[i]\n", + " wordI = words[nodeI]\n", + " for j in range(i + 1, nWords):\n", + " nodeJ = wordNodes[j]\n", + " wordJ = words[nodeJ]\n", + " s = sim(wordI, wordJ)\n", + " co += 1\n", + " if s:\n", + " similarity[(nodeI, nodeJ)] = sim(wordI, wordJ)\n", + " si += 1\n", + " if stop:\n", + " break\n", + "\n", + " TF.info(f\"{co:>4} comparisons and {si:>4} similarities\")\n", + " return similarity" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4851 comparisons to make\n", + " 0.01s 4851 comparisons and 3332 similarities\n" + ] + } + ], + "source": [ + "similarity = computeSim()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7\n", + "100\n" + ] + } + ], + "source": [ + "print(min(similarity.values()))\n", + "print(max(similarity.values()))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "eq = [x for x in similarity.items() if x[1] >= 100]\n", + "neq = [x for x in similarity.items() if x[1] <= 50]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "((1, 4), 100)\n", + "((1, 2), 8)\n" + ] + } + ], + "source": [ + "print(eq[0])\n", + "print(neq[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "58\n", + "3247\n" + ] + } + ], + "source": [ + "print(len(eq))\n", + "print(len(neq))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 Everything\n", + "4 everything\n" + ] + } + ], + "source": [ + "print(eq[0][0][0], F.letters.v(eq[0][0][0]))\n", + "print(eq[0][0][1], F.letters.v(eq[0][0][1]))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 Everything\n", + "2 about\n" + ] + } + ], + "source": [ + "print(neq[0][0][0], F.letters.v(neq[0][0][0]))\n", + "print(neq[0][0][1], F.letters.v(neq[0][0][1]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Add parallels to the TF dataset\n", + "\n", + "We now add this information to the Banks dataset as an *edge feature*." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "metaData = {\n", + " \"\": {\n", + " \"name\": \"Banks (similar words)\",\n", + " \"converters\": \"Dirk Roorda\",\n", + " \"sourceUrl\": \"https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/text-fabric/use.ipynb\",\n", + " \"version\": \"0.2\",\n", + " },\n", + " \"sim\": {\n", + " \"valueType\": \"int\",\n", + " \"edgeValues\": True,\n", + " \"description\": \"similarity between words, as a percentage of the common material wrt the combined material\",\n", + " },\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "simData = {}\n", + "for ((f, t), d) in similarity.items():\n", + " simData.setdefault(f, {})[t] = d" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "FOLDER_SIM = \"sim/tf\"\n", + "path = f\"{ORG}/{REPO}/{FOLDER_SIM}\"\n", + "location = f\"{GH_BASE}/{path}\"\n", + "module = VERSION" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0.00s Exporting 0 node and 1 edge and 0 config features to ~/github/annotation/banks/sim/tf/0.2:\n", + " | 0.01s T sim to ~/github/annotation/banks/sim/tf/0.2\n", + " 0.01s Exported 0 node features and 1 edge features and 0 config features to ~/github/annotation/banks/sim/tf/0.2\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "TF.save(\n", + " edgeFeatures=dict(sim=simData), metaData=metaData, location=location, module=module\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "All chapters:\n", + "\n", + "* *use*\n", + "* [share](share.ipynb)\n", + "* [app](app.ipynb)\n", + "* [repo](repo.ipynb)\n", + "* [compose](compose.ipynb)\n", + "\n", + "---\n", + "\n", + "CC-BY Dirk Roorda" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorial/use.py b/tutorial/use.py new file mode 100644 index 0000000..eabd6ad --- /dev/null +++ b/tutorial/use.py @@ -0,0 +1,272 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.11.4 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# +# +# +# +# --- +# Start with [convert](https://nbviewer.jupyter.org/github/annotation/banks/blob/master/programs/convert.ipynb) +# +# --- + +# # Use the Banks example corpus + +# ## Load TF +# +# We are going to load the new data: all features. +# +# We start a new instance of the TF machinery. 
+
+# %load_ext autoreload
+# %autoreload 2
+
+# +
+import os
+
+from tf.fabric import Fabric
+
+# +
+GH_BASE = os.path.expanduser("~/github")
+ORG = "annotation"
+REPO = "banks"
+FOLDER = "tf"
+TF_DIR = f"{GH_BASE}/{ORG}/{REPO}/{FOLDER}"
+
+VERSION = "0.2"
+
+TF_PATH = f"{TF_DIR}/{VERSION}"
+TF = Fabric(locations=TF_PATH)
+# -
+
+# We ask for a list of all features:
+
+allFeatures = TF.explore(silent=True, show=True)
+loadableFeatures = allFeatures["nodes"] + allFeatures["edges"]
+loadableFeatures
+
+# We load all features:
+
+api = TF.load(loadableFeatures, silent=False)
+
+# You see that all files are marked with a `T`.
+#
+# That means that Text-Fabric loads the features by reading the plain text `.tf` files.
+# But after reading, it makes a binary equivalent and stores it as a `.tfx`
+# file in the hidden `.tf` directory next to it.
+#
+# Furthermore, you see some lines marked with `C`. Here Text-Fabric is computing derived data,
+# mostly about sections, the order of nodes, and the relative positions of nodes with respect to the slots they
+# are linked to.
+#
+# The results of this pre-computation are also stored in that hidden `.tf` directory.
+#
+# The next time, Text-Fabric loads the data from the binary `.tfx` files, which is much faster.
+# And the pre-computation step will be skipped.
+#
+# If the binary files get outdated, Text-Fabric will recompile and recompute everything automatically.
+#
+# So let's load again.
+
+TF = Fabric(locations=TF_PATH)
+api = TF.load(loadableFeatures, silent=False)
+
+# Where there were `T`s before, there are now `B`s.
+
+# ### Hoisting
+#
+# We can access all TF data programmatically by using `api.Features`, or `api.F` (same thing), and a bunch of
+# other API members.
+#
+# But if we are working with a single data source, we hoist those API members to the global namespace.
+#
+# Now you can directly refer to `F` and friends.
+
+api.makeAvailableIn(globals())
+
+# As a result, you get an overview of the names you can now use.
+
+# ## Exploration
+#
+# Finally, let's explore this data set by means of Text-Fabric.
+#
+# ### Frequency list
+#
+# We can get ordered frequency lists for the values of all features.
+#
+# First the words:
+
+F.letters.freqList()
+
+# For the node types we can get info by calling this:
+
+C.levels.data
+
+
+# It means that chapters are 49.5 words long on average, and that the chapter nodes are 101 and 102.
+#
+# And you see that we have 99 words.
+
+# # Add to the banks corpus
+#
+# We are going to make a relationship between each pair of words, and we annotate each related pair with how similar they are.
+#
+# We measure the similarity by looking at the distinct letters in each word (lowercased), and computing the percentage of
+# how many letters they have in common with respect to how many letters they jointly have.
+#
+# This will become a symmetric edge feature. Symmetric means that if a and b are similar, then b and a are as well, with the
+# same similarity.
+#
+# We only store one copy of each symmetric pair of edges.
+#
+# We can then use
+# [`E.sim.b(node)`](https://annotation.github.io/text-fabric/tf/core/edgefeature.html)
+# to find all nodes that are parallel to `node`.
+#
+# If words do not have letters in common, their similarity is 0, and we do not make an edge.
+
+# # Preparation
+#
+# We pre-compute all letter sets for all words.
+ + +def makeSet(w): + return set(F.letters.v(w).lower()) + + +# + +words = {} + +for w in F.otype.s("word"): + words[w] = makeSet(w) + +nWords = len(words) +print(f"{nWords} words") + + +# - + + +def sim(wSet, vSet): + return int(round(100 * len(wSet & vSet) / len(wSet | vSet))) + + +# # Compute all similarities +# +# We are going to perform all comparisons. +# +# Since there are 99 words, this will amount to only 5000 comparisons. +# +# For a big corpus, this amount will quickly grow with the number of items to be compared. +# +# See for example the similarities in the +# [Quran](https://nbviewer.jupyter.org/github/q-ran/quran/blob/master/programs/parallels.ipynb). + + +def computeSim(): + similarity = {} + + wordNodes = sorted(words.keys()) + nWords = len(wordNodes) + + nComparisons = nWords * (nWords - 1) // 2 + + print(f"{nComparisons} comparisons to make") + + TF.indent(reset=True) + + co = 0 + si = 0 + stop = False + for i in range(nWords): + nodeI = wordNodes[i] + wordI = words[nodeI] + for j in range(i + 1, nWords): + nodeJ = wordNodes[j] + wordJ = words[nodeJ] + s = sim(wordI, wordJ) + co += 1 + if s: + similarity[(nodeI, nodeJ)] = sim(wordI, wordJ) + si += 1 + if stop: + break + + TF.info(f"{co:>4} comparisons and {si:>4} similarities") + return similarity + + +similarity = computeSim() + +print(min(similarity.values())) +print(max(similarity.values())) + +eq = [x for x in similarity.items() if x[1] >= 100] +neq = [x for x in similarity.items() if x[1] <= 50] + +print(eq[0]) +print(neq[0]) + +print(len(eq)) +print(len(neq)) + +print(eq[0][0][0], F.letters.v(eq[0][0][0])) +print(eq[0][0][1], F.letters.v(eq[0][0][1])) + +print(neq[0][0][0], F.letters.v(neq[0][0][0])) +print(neq[0][0][1], F.letters.v(neq[0][0][1])) + +# # Add parallels to the TF dataset +# +# We now add this information to the Banks dataset as an *edge feature*. + +metaData = { + "": { + "name": "Banks (similar words)", + "converters": "Dirk Roorda", + "sourceUrl": "https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/text-fabric/use.ipynb", + "version": "0.2", + }, + "sim": { + "valueType": "int", + "edgeValues": True, + "description": "similarity between words, as a percentage of the common material wrt the combined material", + }, +} + +simData = {} +for ((f, t), d) in similarity.items(): + simData.setdefault(f, {})[t] = d + +FOLDER_SIM = "sim/tf" +path = f"{ORG}/{REPO}/{FOLDER_SIM}" +location = f"{GH_BASE}/{path}" +module = VERSION + +TF.save( + edgeFeatures=dict(sim=simData), metaData=metaData, location=location, module=module +) + +# --- +# All chapters: +# +# * *use* +# * [share](share.ipynb) +# * [app](app.ipynb) +# * [repo](repo.ipynb) +# * [compose](compose.ipynb) +# +# --- +# +# CC-BY Dirk Roorda
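+
+# ## Appendix: checking the saved feature
+#
+# A minimal sketch of how the freshly saved `sim` module could be loaded back and queried.
+# It assumes the `TF_PATH`, `location` and `module` values computed above, and a Text-Fabric
+# version in which `Fabric` accepts a list of locations and `TF.load` a space-separated
+# feature string.
+
+TF2 = Fabric(locations=[TF_PATH, f"{location}/{module}"])
+api2 = TF2.load("letters sim")
+F2 = api2.F
+E2 = api2.E
+
+# node 1 is the first word ("Everything"); E.sim.b gives edges in both directions,
+# as (node, value) pairs
+for (otherWord, s) in E2.sim.b(1):
+    if s == 100:
+        print(otherWord, F2.letters.v(otherWord), s)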