diff --git a/README.md b/README.md index a4b37a0..1db0b8f 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,64 @@ # descartes-tf Letters of Descartes in Text-Fabric with math display. +[![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip) + +![descartes](docs/images/logo.png) + +# René Descartes - Brieven + +In this repository we prepare the letters of +[Descartes](https://en.wikipedia.org/wiki/René_Descartes) +for the application of data science. + +The source files are provided by the Huygens Institute, as the result of the CKCC project which was completed +in 2012. + +From there we converted it to a +[Text-Fabric](https://github.com/annotation/text-fabric) +representation. + +The result can be readily loaded into Python programs for further processing. + +See [about](about.md) for the provenance of the data. + +See [transcription](transcription.md) for how the resulting data is modelled. + +## How to use + +### Having Text-Fabric installed + +This data can be processed by +[Text-Fabric](https://annotation.github.io/text-fabric/tf). + +Text-Fabric will automatically download the corpus data. + +After [installing Text-Fabric](https://annotation.github.io/text-fabric/tf/about/install.html), +you can start the Text-Fabric browser by this command + +```sh +text-fabric CLARIAH/descartes-tei +``` + +Alternatively, you can work in a Jupyter notebook and say + +```python +from tf.app import use + +A = use('CLARIAH/descartes-tei') +``` + +In both cases the data is downloaded and ends up in your home directory, +under `text-fabric-data`. + +See also +[start](https://nbviewer.jupyter.org/github/CLARIAH/descartes-tei/blob/master/tutorial/start.ipynb) +and +[search](https://nbviewer.jupyter.org/github/CLARIAH/descartes-tei/blob/master/tutorial/search.ipynb). 
+ +# Author + +See [about](about.md) for the authors/editors of the data. + +[Dirk Roorda](https://github.com/dirkroorda) is the author of the representation in Text-Fabric of the data, +and the tutorials and documentation. diff --git a/app/app.py b/app/app.py index 22a3739..d0942df 100644 --- a/app/app.py +++ b/app/app.py @@ -1,17 +1,40 @@ +import types from tf.advanced.find import loadModule from tf.advanced.app import App +MODIFIERS = "italic margin sub sup".strip().split() + + +def fmt_layoutOrig(app, n, **kwargs): + return app._wrapHtml(n, None) + + class TfApp(App): def __init__(app, *args, silent=False, **kwargs): + app.fmt_layoutOrig = types.MethodType(fmt_layoutOrig, app) + super().__init__(*args, silent=silent, **kwargs) + app.image = loadModule("image", *args) app.image.getImagery(app, silent, checkout=kwargs.get("checkout", "")) app.reinit() - # PRETTY HELPERS + # FORMAT support + + def _wrapHtml(app, n, kind): + api = app.api + F = api.F + Fs = api.Fs + trans = F.trans.v(n) or "" + punc = F.punc.v(n) or "" + material = f"{trans}{punc}" + clses = " ".join(cf for cf in MODIFIERS if Fs(f"is{cf}").v(n)) + return f'{material}' if clses else f"{material}" + + # GRAPHICS Support def getGraphics(app, isPretty, n, nType, outer): result = "" diff --git a/app/config.yaml b/app/config.yaml index 465e9cf..1d2dbbe 100644 --- a/app/config.yaml +++ b/app/config.yaml @@ -1,4 +1,9 @@ apiVersion: 3 +dataDisplay: + exampleSectionHtml: letter 1:1001 + textFormats: + layout-orig-full: + method: layoutOrig docs: docPage: about featureBase: 'https://github.com/{org}/{repo}/blob/master/docs/transcription{docExt}' @@ -6,18 +11,36 @@ docs: interfaceDefaults: showGraphics: true showMath: true + standardFeatures: false + withLabels: true provenanceSpec: corpus: Descartes = Descartes, all letters graphicsRelative: source/illustrations - version: 0.9 + version: 1.0 webBase: http://emlo-portal.bodleian.ox.ac.uk/collections/?catalogue=rene-descartes webHint: See how this corpus is 
included in the Bodleian catalog + moduleSpecs: + - corpus: Similar Sentences + relative: parallels/tf typeDisplay: volume: - featuresBare: n + label: '{n}' + template: 'vol. {n}' + page: + label: '{n}' + template: 'p. {n}' letter: - featuresBare: id + label: '{id} {date} from {sender} to {recipient}' + template: '{id} {date} from {sender} to {recipient}' + features: senderloc recipientloc p: - featuresBare: n + label: '{n}' + sentence: + label: '{n}' + condense: true figure: + label: '{url}' graphics: true + formula: + label: '{notation}' + features: tex diff --git a/app/static/display.css b/app/static/display.css new file mode 100644 index 0000000..d7d465b --- /dev/null +++ b/app/static/display.css @@ -0,0 +1,17 @@ +.italic { + font-style: italic; +} +.margin { + position: relative; + top: -0.3em; + font-weight: bold; + color: #0000ee; +} +.sub { + vertical-align: sub; + font-size: small; +} +.sup { + vertical-align: super; + font-size: small; +} diff --git a/docs/transcription.md b/docs/transcription.md index 65bb1a0..3d97d0c 100644 --- a/docs/transcription.md +++ b/docs/transcription.md @@ -22,6 +22,24 @@ postscriptum of the letter. Letters may contain illustrations, symbols, and mathematical formulas. +### Sentences + +We have added the concept of sentence. +A sentence is a piece of text within a paragraph that is +terminated by a `.` . + +Not all `.`s act as sentence terminator, though, e.g. in +`Kal. Aprilis` it marks an abbreviation. + +We have tried to exclude most of these cases. + +The purpose of adding sentences was to have a convenient +division within paragraphs. This division can be used to +display manageable chunks of the corpus. + +It can also be used to detect parallel passages, i.e. pieces +where Descartes repeats himself. 
+ ## Text-Fabric model The Text-Fabric model views the text as a series of atomic units, called @@ -30,24 +48,11 @@ The Text-Fabric model views the text as a series of atomic units, called On top of that, more complex textual objects can be represented as *nodes*. In this corpus we have node types for: -volume 9 75811.11 100 -letter 725 941.10 100 -page 2884 236.58 100 -p 8438 80.86 100 -postscriptum 56 46.79 0 -head 725 23.37 2 -address 86 15.22 0 -closer 541 13.10 1 -hi 5972 4.63 4 -opener 545 1.97 0 -formula 6200 1.27 1 -figure 319 1.00 0 -word 682300 1.00 100 - [*word*](#node-type-word), [*hi*](#node-type-hi), [*figure*](#node-type-figure), [*formula*](#node-type-formula), +[*sentence*](#node-type-sentence), [*head*](#node-type-head), [*opener*](#node-type-opener), [*closer*](#node-type-closer), @@ -109,7 +114,7 @@ feature | values | description **ismargin** | `1` | indicates the word is in the margin **issub** | `1` | indicates the word is in subscript **issup** | `1` | indicates the word is in superscript -**typ** | `empty` | indicates the kind of word +**typ** | `empty` `formula` | indicates the kind of word * **typ** = `empty`: deliberately empty word, i.e. **trans** is empty or absent, however, **punc** may contain something, typically a space @@ -147,7 +152,18 @@ This gives you the opportunity to view the source code of formulas. feature | values | description ------- | ------ | ------ -**notation** | `A\over B` | TeX source code of a formula +**notation** | `TeX` | notation method of the formula +**tex** | `A\over B` | TeX source code of a formula + +## Node type [*sentence*](#sentence) + +Sentence, i.e. a part in a paragraph terminated by a full stop. +`.` that are used for other purposes do not count as a full stop, +e.g. in abbreviations and numbers. + +feature | values | description +------- | ------ | ------ +**n** | `1` `2` | sequence number of a sentence within the paragraph. 
## Node type [*head*](#head) @@ -230,6 +246,7 @@ The following text formats are defined (you can also list them with `T.formats`) format | description --- | --- `text-orig-full` | the full text of all words +`layout-orig-full` | the full text of all words, with special formatting indicating special characteristics of the text. The formats with `text` result in strings that are plain text, without additional formatting. diff --git a/parallels/tf/1.0/sim.tf b/parallels/tf/1.0/sim.tf new file mode 100644 index 0000000..394ce70 --- /dev/null +++ b/parallels/tf/1.0/sim.tf @@ -0,0 +1,375 @@ +@edge +@edgeValues +@author=René Descartes +@contributors=Erik-Jan Bos; Katsuzo Murakami (University of Tokyo); Meguru Sasaki (École normale superieure d'Hokkaido); Takehumi Tokoro (University of Chyuo) +@converters=Dirk Roorda (Text-Fabric) +@description=similarity between sentences based on the Levenshtein ratio +@descriptionTf=Critical edition with various variants +@institute=KNAW/Huygens Amsterdam +@language=nld +@sourceFormat=TEI +@valueType=int +@writtenBy=Text-Fabric +@dateWritten=2023-01-12T15:51:13Z + +708472 716876 100 +708666 708695 83 +708669 708698 96 +709036 718572 81 +709036 709056,713182 90 +709056 713182,718572 84 +709189 709300 100 +710056 716424,717989 100 +710153 710696 87 +710294 717986 88 +710294 712761,714571 100 +710336 714134,714137 86 +710336 710346,710358,710383,710393,710398,713034,714066,716476,719448,719454,719459 100 +710346 714134,714137 86 +710346 710358,710383,710393,710398,713034,714066,716476,719448,719454,719459 100 +710358 714134,714137 86 +710358 710383,710393,710398,713034,714066,716476,719448,719454,719459 100 +710383 714134,714137 86 +710383 710393,710398,713034,714066,716476,719448,719454,719459 100 +710393 714134,714137 86 +710393 710398,713034,714066,716476,719448,719454,719459 100 +710398 714134,714137 86 +710398 713034,714066,716476,719448,719454,719459 100 +710514 710543,710566,710570,710580,710588,710594,710603 100 +710543 
710566,710570,710580,710588,710594,710603 100 +710566 710570,710580,710588,710594,710603 100 +710570 710580,710588,710594,710603 100 +710580 710588,710594,710603 100 +710588 710594,710603 100 +710594 710603 100 +710639 712333 81 +712335 92 +712336 97 +710659 710675,712362 100 +710673 712359 95 +710675 712362 100 +710864 710998 81 +711073 721391 84 +711073 711104,711112,721377 100 +711076 713770 81 +711076 711108 85 +711099 711115 100 +711104 721391 84 +711104 711112,721377 100 +711112 721391 84 +711112 721377 100 +711171 712283 81 +711244 711248 93 +711244 711246 96 +711247 97 +711248 92 +711250 711254,711258 98 +711259 97 +711251 711255 100 +711260 88 +711252 711256 100 +711257 95 +711258 100 +711259 97 +711260 88 +711262 711268 83 +711262 711265 100 +711266 94 +711265 711268 83 +711327 711575 89 +711924 711961 82 +711964 711970,711978 100 +711967 711975 82 +711970 711978 100 +712021 712023 80 +712023 712025 81 +712148 712212 89 +712154 712218 96 +712178 712269 80 +712239 712649 87 +712239 712647 98 +712366 712381 88 +712548 712873 80 +712583 712601,712605,712612,712617 84 +712583 712588,712593,712646,712650,712659,712675,712683,712689,712703 89 +712583 712663 100 +712588 712601,712605,712612,712617 84 +712588 712593,712646,712650,712659,712663,712683,712689,712703 89 +712588 712675 100 +712593 712601,712605,712612,712617 84 +712593 712646,712650,712659,712663,712675,712689,712703 89 +712593 712683 100 +712601 712646,712650,712659,712663,712675,712683,712689,712703 84 +712601 712605,712612,712617 90 +712605 712646,712650,712659,712663,712675,712683,712689,712703 84 +712605 712612,712617 90 +712612 712650,712659,712663,712675,712683,712689,712703 84 +712612 712617 90 +712612 712646 95 +712617 712646,712659,712663,712675,712683,712689,712703 84 +712617 712650 95 +712646 712650,712659,712663,712675,712683,712689,712703 89 +712649 85 +712650 712659,712663,712675,712683,712689,712703 89 +712659 712663,712675,712683,712689,712703 89 +712663 712675,712683,712689,712703 
89 +712675 712683,712689,712703 89 +712683 712689,712703 89 +712689 712703 89 +712761 717986 88 +712761 714571 100 +712792 712803 100 +713030 713056,713058,713063,713065,713067,713071,713075,713078,713082,713084 80 +713030 713092,713103,713106,722270 88 +713030 713040,713053,713060,713073,713080 93 +713034 714134,714137 86 +713034 714066,716476,719448,719454,719459 100 +713040 713087,713090,713094,713096,713098,713100,713103,713106,722270 82 +713040 716569 84 +713040 713053,713056,713060,713073,713078,713080 88 +713040 713092 94 +713053 713092,713103,713106,722270 82 +713053 716583 84 +713053 713060,713065,713073,713080,713084 88 +713056 713087,713090,713092,713094,713096,713098,713100,713103,722270,722273 82 +713056 713063,713078,713084 88 +713058 713087,713090,713096 82 +713058 713060,713063,713065,713067,713071 88 +713058 713094 94 +713060 713090,713092,713094,713096,713103,713106,722270 82 +713060 713063,713065,713071,713073,713080 88 +713063 713087,713090,713094,713096,713103,722270,722273 82 +713063 713065,713071,713084 88 +713065 713090,713094,713096 82 +713065 713071,713084 88 +713067 713087,713094,713098 82 +713067 713071 88 +713071 713090,713094,713096,713098 82 +713073 713092,713096,713100,713103,713106,722270 82 +713073 713075,713080 88 +713075 713092,713096,713100,713103,722270 82 +713075 713082 88 +713075 713106 94 +713078 713087,713090,713092,713094,713096,713098,713100,722273 82 +713078 713080,713082 88 +713080 713092,713098,713103,713106,722270,722273 82 +713080 713082 88 +713082 713092,713098,713106,722273 82 +713084 713087,713103,722270,722273 82 +713087 713094 89 +713090 713094,713096 89 +713092 716569 80 +713092 713106 89 +713094 713096 89 +713096 713100 89 +713098 716571 80 +713103 716575,716579,716585 80 +713103 713106,722270 89 +713106 716579 80 +713106 722270 89 +713543 713544 91 +713950 721230 100 +714064 714872,715492 100 +715493 83 +714134,714137 86 +714066 716476,719448,719454,719459 100 +714134 716476,719448,719454,719459 86 +714134 
714137 100 +714137 716476,719448,719454,719459 86 +714192 721179 80 +714192 721778 83 +714192 714194,722090,722586 91 +714192 719352 100 +714194 722090 83 +714194 719352 91 +714194 722586 100 +714281 715036,715040,715312,715487,717464,717614,717845,717889,717897,717936,717948,718451,718580,719038,719180,719191,719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +714340 714348 87 +714345 714357 86 +714571 717986 88 +714663 714876,718552 88 +714663 718548 100 +714665 718670,721498,722607,722660,722663 100 +714872 715492 100 +714876 718548 88 +715036 715040,715312,715487,717464,717614,717845,717889,717897,717936,717948,718451,718580,719038,719180,719191,719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +715040 715312,715487,717464,717614,717845,717889,717897,717936,717948,718451,718580,719038,719180,719191,719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +715171 716730,716921 100 +715312 715487,717464,717614,717845,717889,717897,717936,717948,718451,718580,719038,719180,719191,719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +715487 717464,717614,717845,717889,717897,717936,717948,718451,718580,719038,719180,719191,719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +715888 715955 81 +715956 97 +715894 715984 92 +716041 716046,716054 80 +716046 716054 80 +716235 716240,720980 100 +716240 720980 100 +716317 716334 80 +716336 716338 81 +716424 717989 100 +716470 721778 80 +716470 716472 100 +716472 721778 80 +716476 719448,719454,719459 100 +716501 716504 85 +716518 716526 84 +716567 716603 82 +716567 716569,716571,716573,716575,716577,716579,716581,716583,716585,716590,716592,716594,716596,716598 86 +716567 716588,716601 95 +716569 716585,716588,716590,716592,716596,716598,716601 82 +716569 716603 87 +716569 716571,716573,716575,716577,716579,716581,716583,716594 91 +716571 
716573,716575,716577,716579,716583,716585,716588,716590,716594,716596,716598,716601 82 +716571 716581,716592 91 +716571 716603 96 +716573 716585,716588,716590,716592,716596,716598,716601 82 +716573 716575,716577,716579,716581,716583,716594 91 +716575 716590,716596,716598,716601 82 +716575 716577,716579,716581,716583,716585,716588,716592,716594 91 +716577 716585,716588,716590,716592,716598,716601 82 +716577 716579,716581,716583,716594,716596 91 +716579 722270 80 +716579 716588,716592,716596,716598,716601 82 +716579 716581,716583,716585,716590,716594 91 +716581 716585,716588,716590,716596,716598,716601 82 +716581 716603 87 +716581 716583,716592,716594 91 +716583 716585,716588,716590,716592 82 +716583 716594,716596,716598,716601 91 +716585 722270 80 +716585 716594,716596,716598,716601 82 +716585 716588,716590,716592 91 +716588 716590,716594,716596,716598 82 +716588 716592,716601 91 +716590 716592,716594,716596,716598,716601 82 +716590 716603 87 +716592 722273 80 +716592 716594,716596,716598,716601 82 +716592 716603 87 +716594 716596,716598,716601 91 +716596 716598,716601 91 +716598 716601 91 +716730 716921 100 +716943 716953 100 +716975 716977 82 +716977 716986 88 +716990 716994 80 +717160 718202 100 +718203 100 +718204 100 +718205 100 +718206 100 +718207 99 +718208 99 +718209 100 +718210 100 +718211 100 +718212 100 +717410 717609 91 +717464 717614,717845,717889,717897,717936,717948,718451,718580,719038,719180,719191,719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +717614 717845,717889,717897,717936,717948,718451,718580,719038,719180,719191,719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +717845 717889,717897,717936,717948,718451,718580,719038,719180,719191,719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +717889 717897,717936,717948,718451,718580,719038,719180,719191,719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 
100 +717897 717936,717948,718451,718580,719038,719180,719191,719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +717936 717948,718451,718580,719038,719180,719191,719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +717948 718451,718580,719038,719180,719191,719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +718006 718138,718142,718150 100 +718138 718142,718150 100 +718142 718150 100 +718451 718580,719038,719180,719191,719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +718548 718552 88 +718580 719038,719180,719191,719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +718605 721289 99 +718655 722217,722219 100 +718742 80 +718656 722451 89 +718656 718757,718963,718968,719256,719341 100 +718668 722515 80 +718668 721531 82 +718670 721498,722607,722660,722663 100 +718742 718757,718963,718968,719256,719341 80 +718757 722451 89 +718757 718963,718968,719256,719341 100 +718963 722451 89 +718963 718968,719256,719341 100 +718968 722451 89 +718968 719256,719341 100 +719038 719180,719191,719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +719180 719191,719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +719191 719540,719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +719256 722451 89 +719256 719341 100 +719337 719409 100 +719341 722451 89 +719352 721179 80 +719352 721778 83 +719352 722090,722586 91 +719355 722606 80 +719448 719454,719459 100 +719454 719459 100 +719540 719660,720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +719637 719640 83 +719660 720327,720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +719765 719842,719917 82 +719765 719960 84 +719765 719810,719923 89 +719765 719892 90 +719765 719788,719880,719934 95 +719788 719960 80 +719788 
719810,719923 84 +719788 719880,719934 90 +719788 719892 95 +719810 719892 80 +719810 719917 82 +719810 719880,719934 84 +719810 719923 89 +719810 719842 94 +719810 719960 95 +719842 719923 82 +719842 719892 84 +719842 719917 88 +719842 719880,719960 89 +719880 719960 80 +719880 719923 84 +719880 719934 90 +719880 719892 95 +719892 719923 80 +719892 719934 86 +719917 719934,719960 89 +719917 719923 94 +719923 719934,719960 95 +719934 719960 90 +720022 720027,720036,720050,720055,720065 100 +720027 720036,720050,720055,720065 100 +720033 720041,720045,720059 100 +720036 720050,720055,720065 100 +720041 720045,720059 100 +720045 720059 100 +720050 720055,720065 100 +720055 720065 100 +720327 720504,720676,720692,720840,721148,721271,721356,721395,722705 100 +720504 720676,720692,720840,721148,721271,721356,721395,722705 100 +720676 720692,720840,721148,721271,721356,721395,722705 100 +720692 720840,721148,721271,721356,721395,722705 100 +720840 721148,721271,721356,721395,722705 100 +721066 722088 100 +721148 721271,721356,721395,722705 100 +721271 721356,721395,722705 100 +721356 721395,722705 100 +721377 721391 84 +721395 722705 100 +721498 722607,722660,722663 100 +721752 721954 98 +721847 721871 90 +721890 721893 90 +721893 721951 80 +722089 722614 82 +722089 722572 85 +722586 83 +722140 722151 89 +722217 722219 100 +722572 722614 96 +722606 722626 94 +722660,722663 100 +722660 722663 100 +722664 100 diff --git a/programs/illustrations/illustration-682575.gif b/programs/illustrations/illustration-682575.gif new file mode 100644 index 0000000..651b766 Binary files /dev/null and b/programs/illustrations/illustration-682575.gif differ diff --git a/programs/illustrations/illustration-682632.gif b/programs/illustrations/illustration-682632.gif new file mode 100644 index 0000000..e32b4a6 Binary files /dev/null and b/programs/illustrations/illustration-682632.gif differ diff --git a/programs/illustrations/illustration-682657.gif 
b/programs/illustrations/illustration-682657.gif new file mode 100644 index 0000000..1dcc614 Binary files /dev/null and b/programs/illustrations/illustration-682657.gif differ diff --git a/programs/illustrations/illustration-682658.gif b/programs/illustrations/illustration-682658.gif new file mode 100644 index 0000000..886f0b4 Binary files /dev/null and b/programs/illustrations/illustration-682658.gif differ diff --git a/programs/illustrations/illustration-682659.gif b/programs/illustrations/illustration-682659.gif new file mode 100644 index 0000000..1b7fd6d Binary files /dev/null and b/programs/illustrations/illustration-682659.gif differ diff --git a/programs/parallels.ipynb b/programs/parallels.ipynb index 1b2f1d0..7ff5f6a 100644 --- a/programs/parallels.ipynb +++ b/programs/parallels.ipynb @@ -20,6 +20,7 @@ "import collections\n", "import pickle\n", "import gzip\n", + "import re\n", "\n", "from tf.app import use" ] @@ -32,7 +33,7 @@ { "data": { "text/html": [ - "TF-app: ~/gitlab.huc.knaw.nl/hermans/works/app" + "TF-app: ~/github/CLARIAH/descartes-tf/app" ], "text/plain": [ "" @@ -44,7 +45,7 @@ { "data": { "text/html": [ - "data: ~/gitlab.huc.knaw.nl/hermans/works/tf/0.4" + "data: ~/github/CLARIAH/descartes-tf/tf/1.0" ], "text/plain": [ "" @@ -56,183 +57,285 @@ { "data": { "text/html": [ - "Text-Fabric: Text-Fabric API 10.2.1, hermans/works/app v3, Search Reference
Data: WORKS, Character table, Feature docs
Features:
\n", - "
W.F. Hermans - Volledige Werken\n", - "
\n", - "\n", - "
\n", - "
\n", - "acro\n", - "
\n", - "
str
\n", - "\n", - " acronym of a work\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "anchor_id\n", - "
\n", - "
str
\n", - "\n", - " id of an anchor element\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "author\n", - "
\n", - "
str
\n", - "\n", - " author of a book (titleRef elements)\n", + "data: ~/github/CLARIAH/descartes-tf/parallels/tf/1.0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is Text-Fabric 11.0.7\n", + "Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html\n", + "\n", + "28 features found and 0 ignored\n", + " 0.09s Dataset without structure sections in otext:no structure functions in the T-API\n", + " 0.34s All features loaded/computed - for details use TF.isLoaded()\n", + " 0.01s All additional features loaded - for details use TF.isLoaded()\n" + ] + }, + { + "data": { + "text/html": [ "\n", - "
\n", + " Text-Fabric: Text-Fabric API 11.0.7, CLARIAH/descartes-tf/app v3, Search Reference
\n", + " Data: DESCARTES-TF, Character table, Feature docs
\n", + "
Node types\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
Name# of nodes# slots/node% coverage
volume885241.88100
letter725940.60100
page2884236.45100
postscriptum5646.790
opener5451.970
closer54113.101
address8615.220
head72523.372
p843880.82100
sentence1433245.7496
hi59724.634
formula62001.211
figure3191.000
word6819351.00100
\n", + " Sets: no custom sets
\n", + " Features:
\n", + "
Similar Sentences\n", + "
\n", "\n", "
\n", - "
\n", - "cat\n", + "
\n", + "sim\n", "
\n", - "
str
\n", - "\n", - " cat of a book (titleRef elements)\n", + "
int
\n", "\n", - "
\n", + " similarity between sentences based on the Levenshtein ratio\n", "\n", - "
\n", - "
\n", - "cause\n", "
\n", - "
str
\n", "\n", - " nature of a variant (lem or rdg)\n", + "
\n", + "
\n", "\n", - "
\n", + "
Descartes = Descartes, all letters\n", + "
\n", "\n", "
\n", "
\n", - "code\n", + "alt_date\n", "
\n", "
str
\n", "\n", - " kind of something, e.g. a name\n", + " alternative date of a letter\n", "\n", "
\n", "\n", "
\n", "
\n", - "completed\n", + "alt_id\n", "
\n", "
str
\n", "\n", - " completion date of the digitization of a work\n", + " alternative ids of a letter, comma separated\n", "\n", "
\n", "\n", "
\n", "
\n", - "contributors\n", + "cert\n", "
\n", "
str
\n", "\n", - " contributors of a work\n", + " certainty of something\n", "\n", "
\n", "\n", "
\n", "
\n", - "ed\n", + "date\n", "
\n", "
str
\n", "\n", - " editor of something (lem or pb)\n", + " date of a letter\n", "\n", "
\n", "\n", "
\n", "
\n", - "edRat\n", + "id\n", "
\n", "
str
\n", "\n", - " editor of lem\n", + " id of a letter\n", "\n", "
\n", "\n", "
\n", "
\n", - "editor\n", + "intermediary\n", "
\n", "
str
\n", "\n", - " editor of a work\n", + " person involved in the transmission of the letter from sender to receiver\n", "\n", "
\n", "\n", "
\n", "
\n", - "id\n", + "isitalic\n", "
\n", "
str
\n", "\n", - " id of an element\n", + " whether the word is in italic\n", "\n", "
\n", "\n", "
\n", "
\n", - "iscursief\n", + "ismargin\n", "
\n", "
str
\n", "\n", - " whether the word is in italic\n", + " whether the word is in the margin\n", "\n", "
\n", "\n", "
\n", "
\n", - "iskap\n", + "issub\n", "
\n", "
str
\n", "\n", - " whether the word is in uppercase\n", + " whether the word is in subscript\n", "\n", "
\n", "\n", "
\n", "
\n", - "issc\n", + "issup\n", "
\n", "
str
\n", "\n", - " whether the word is in small caps\n", + " whether the word is in superscript\n", "\n", "
\n", "\n", "
\n", "
\n", - "isspat\n", + "language\n", "
\n", "
str
\n", "\n", - " whether the word is differently spaced\n", + " language of a letter\n", "\n", "
\n", "\n", "
\n", "
\n", - "isvet\n", + "level\n", "
\n", "
str
\n", "\n", - " whether the word is in bold\n", + " level of a paragraph when it acts like a heading\n", "\n", "
\n", "\n", "
\n", "
\n", - "n\n", + "n\n", "
\n", "
int
\n", "\n", @@ -242,47 +345,17 @@ "\n", "
\n", "
\n", - "next\n", - "
\n", - "
str
\n", - "\n", - " id of next analogous element\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "note\n", - "
\n", - "
str
\n", - "\n", - " text of a note to a word\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "nstr\n", - "
\n", - "
str
\n", - "\n", - " number of whatever element if the value is not numeric\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "orig\n", + "notation\n", "
\n", "
str
\n", "\n", - " original title of a book (titleRef elements)\n", + " notation method of a formula\n", "\n", "
\n", "\n", "
\n", "
\n", - "otype\n", + "otype\n", "
\n", "
str
\n", "\n", @@ -292,5611 +365,182 @@ "\n", "
\n", "
\n", - "prev\n", - "
\n", - "
str
\n", - "\n", - " id of previous analogous element\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "project\n", - "
\n", - "
str
\n", - "\n", - " project in which a work has been digitized\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word outside apparatus and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in lem and not in any rdg; for words in rdgs it is the empty string, and it is undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_151\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness 151 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_249\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness 249 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_bhd1\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness bhd1 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_bhd2\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness bhd2 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_bhd20\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness bhd20 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_bhd25\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness bhd25 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_bhd26\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness bhd26 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_bhd5\n", + "punc\n", "
\n", "
str
\n", "\n", - " nonword chars after a word in an rdg for witness bhd5 and undefined elsewhere\n", + " nonword chars after a word \n", "\n", "
\n", "\n", "
\n", "
\n", - "punc_bhd7\n", + "recipient\n", "
\n", "
str
\n", "\n", - " nonword chars after a word in an rdg for witness bhd7 and undefined elsewhere\n", + " recipient of a letter\n", "\n", "
\n", "\n", "
\n", "
\n", - "punc_bt\n", + "recipientloc\n", "
\n", "
str
\n", "\n", - " nonword chars after a word in an rdg for witness bt and undefined elsewhere\n", + " location from where a letter was received\n", "\n", "
\n", "\n", "
\n", "
\n", - "punc_d1\n", + "resp\n", "
\n", "
str
\n", "\n", - " nonword chars after a word in an rdg for witness d1 and undefined elsewhere\n", + " person responsible for something\n", "\n", "
\n", "\n", "
\n", "
\n", - "punc_d10\n", + "sender\n", "
\n", "
str
\n", "\n", - " nonword chars after a word in an rdg for witness d10 and undefined elsewhere\n", + " sender of a letter\n", "\n", "
\n", "\n", "
\n", "
\n", - "punc_d11\n", + "senderloc\n", "
\n", "
str
\n", "\n", - " nonword chars after a word in an rdg for witness d11 and undefined elsewhere\n", + " location from where a letter was sent\n", "\n", "
\n", "\n", "
\n", "
\n", - "punc_d12\n", + "tex\n", "
\n", "
str
\n", "\n", - " nonword chars after a word in an rdg for witness d12 and undefined elsewhere\n", + " unformatted TeX code of a formula, without the `$`\n", "\n", "
\n", "\n", "
\n", "
\n", - "punc_d14\n", + "trans\n", "
\n", "
str
\n", "\n", - " nonword chars after a word in an rdg for witness d14 and undefined elsewhere\n", + " transcription of a word \n", "\n", "
\n", "\n", "
\n", "
\n", - "punc_d15\n", + "typ\n", "
\n", "
str
\n", "\n", - " nonword chars after a word in an rdg for witness d15 and undefined elsewhere\n", + " kind of a node; \"empty\"; \"formula\", \"head\", \"symbol\", \"illustration\"\n", "\n", "
\n", "\n", "
\n", "
\n", - "punc_d16\n", + "url\n", "
\n", "
str
\n", "\n", - " nonword chars after a word in an rdg for witness d16 and undefined elsewhere\n", + " url of a graphic node\n", "\n", "
\n", "\n", "
\n", - "
\n", - "punc_d2\n", + "
\n", + "oslots\n", "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d2 and undefined elsewhere\n", + "
none
\n", "\n", - "
\n", + " \n", "\n", - "
\n", - "
\n", - "punc_d22\n", "
\n", - "
str
\n", "\n", - " nonword chars after a word in an rdg for witness d22 and undefined elsewhere\n", - "\n", - "
\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
Text-Fabric API: names N F E L T S C TF directly usable

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "A = use(\n", - " \"hermans/works:clone\",\n", - " checkout=\"clone\",\n", - " backend=\"gitlab.huc.knaw.nl\",\n", - " hoist=globals(),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "# Parallels\n", - "\n", - "We make edges between similar sentences.\n", - "\n", - "When are sentences similar?\n", - "\n", - "If a certain distance metric is above a certain threshold.\n", - "\n", - "We choose this metric:\n", - "\n", - "* we reduce a sentence to the set of words in it, excluding punctuation.\n", - "* the similarity between two sentences is the size of the intersection divided by the size of the union of their sets times 100." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Preparation\n", - "\n", - "We pre-compute all sets for all sentences in the base text.\n", - "But we weed out the sentences that do not start with a capital letter." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def makeSet(sentence):\n", - " sentenceSet = set()\n", - " for (i, w) in enumerate(L.d(sentence, otype=\"word\")):\n", - " text = F.transb.v(w)\n", - " if i == 0:\n", - " if text == \"\":\n", - " break\n", - " firstLetter = text[0]\n", - " if not firstLetter.isalpha() or firstLetter.upper() != firstLetter:\n", - " break\n", - " if text:\n", - " sentenceSet.add(text)\n", - " \n", - " return sentenceSet" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "lines_to_end_of_cell_marker": 2 - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 0.01s 13163 results\n" - ] - } - ], - "source": [ - "query = \"\"\"\n", - "sentence\n", - "/with/\n", - ".. wit#\n", - "/or/\n", - ".. 
wit=base\n", - "/-/\n", - "\"\"\"\n", - "results = A.search(query, shallow=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Weed out the sentences that do not start with a capital letter." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "lines_to_end_of_cell_marker": 2 - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10741 sentences\n" - ] - } - ], - "source": [ - "sentences = {}\n", - "\n", - "for sentence in results:\n", - " sentenceSet = makeSet(sentence)\n", - " if sentenceSet:\n", - " sentences[sentence] = sentenceSet\n", - "\n", - "nSentences = len(sentences)\n", - "print(f\"{nSentences} sentences\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "lines_to_next_cell": 2 - }, - "source": [ - "# Measure" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "def sim(lSet, mSet):\n", - " return int(round(100 * len(lSet & mSet) / len(lSet | mSet)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Compute all similarities\n", - "\n", - "We are going to perform several millions of comparisons, each of which is more than an elemetary operation.\n", - "\n", - "Let's measure time." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "THRESHOLD = 50\n", - "\n", - "\n", - "def computeSim(limit=None):\n", - " similarity = {}\n", - "\n", - " sentenceNodes = sorted(sentences.keys())\n", - " nSentences = len(sentenceNodes)\n", - "\n", - " nComparisons = nSentences * (nSentences - 1) // 2\n", - "\n", - " print(f\"{nComparisons} comparisons to make\")\n", - " chunkSize = nComparisons // 100\n", - "\n", - " co = 0\n", - " b = 0\n", - " si = 0\n", - " p = 0\n", - "\n", - " A.indent(reset=True)\n", - "\n", - " stop = False\n", - " for i in range(nSentences):\n", - " nodeI = sentenceNodes[i]\n", - " sentenceI = sentences[nodeI]\n", - " for j in range(i + 1, nSentences):\n", - " nodeJ = sentenceNodes[j]\n", - " sentenceJ = sentences[nodeJ]\n", - " s = sim(sentenceI, sentenceJ)\n", - " co += 1\n", - " b += 1\n", - " if b == chunkSize:\n", - " p += 1\n", - " A.info(f\"{p:>3}% - {co:>12} comparisons and {si:>10} similarities\")\n", - " b = 0\n", - " if limit is not None and p >= limit:\n", - " stop = True\n", - " break\n", - "\n", - " if s < THRESHOLD:\n", - " continue\n", - " similarity[(nodeI, nodeJ)] = sim(sentenceI, sentenceJ)\n", - " si += 1\n", - " if stop:\n", - " break\n", - "\n", - " A.info(f\"{p:>3}% - {co:>12} comparisons and {si:>10} similarities\")\n", - " return similarity" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are going to run it to a few % first and do some checks then." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "57679170 comparisons to make\n", - " 0.66s 1% - 576791 comparisons and 1 similarities\n", - " 1.27s 2% - 1153582 comparisons and 1 similarities\n", - " 1.88s 3% - 1730373 comparisons and 2 similarities\n", - " 2.47s 4% - 2307164 comparisons and 2 similarities\n", - " 3.06s 5% - 2883955 comparisons and 7 similarities\n", - " 3.65s 6% - 3460746 comparisons and 7 similarities\n", - " 4.22s 7% - 4037537 comparisons and 11 similarities\n", - " 4.80s 8% - 4614328 comparisons and 12 similarities\n", - " 5.38s 9% - 5191119 comparisons and 16 similarities\n", - " 6.00s 10% - 5767910 comparisons and 19 similarities\n", - " 6.00s 10% - 5767910 comparisons and 19 similarities\n" - ] - } - ], - "source": [ - "similarity = computeSim(limit=10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We check the sanity of the results." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50\n", - "100\n" - ] - } - ], - "source": [ - "print(min(similarity.values()) if len(similarity) else 0)\n", - "print(max(similarity.values()) if len(similarity) else 0)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "eq = [x for x in similarity.items() if x[1] >= 100]\n", - "neq = [x for x in similarity.items() if x[1] <= THRESHOLD]" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1\n", - "11\n" - ] - } - ], - "source": [ - "print(len(eq))\n", - "print(len(neq))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "((255776, 271017), 100)\n", - "((256084, 266565), 50)\n" - ] - } - ], - "source": [ - "print(eq[0] if len(eq) else 0)\n", - "print(neq[0] if len(neq) else 0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Looks good.\n", - "\n", - "Now the whole computation.\n", - "\n", - "But if we have done this before, and nothing has changed, we load previous results from disk.\n", - "\n", - "If we do not find previous results, we compute them and save the results to disk." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "PARA_DIR = f\"{A.tempDir}/parallels\"\n", - "\n", - "\n", - "def writeResults(data, location, name):\n", - " if not os.path.exists(location):\n", - " os.makedirs(location, exist_ok=True)\n", - " path = f\"{location}/{name}\"\n", - " with gzip.open(path, \"wb\") as f:\n", - " pickle.dump(data, f)\n", - " print(f\"Data written to {path}\")\n", - "\n", - "\n", - "def readResults(location, name):\n", - " path = f\"{location}/{name}\"\n", - " if not os.path.exists(path):\n", - " print(f\"File not found: {path}\")\n", - " return None\n", - " with gzip.open(path, \"rb\") as f:\n", - " data = pickle.load(f)\n", - " print(f\"Data read from {path}\")\n", - " return data" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "File not found: /Users/me/gitlab.huc.knaw.nl/hermans/works/_temp/parallels/sim-0.4.zip\n", - "57679170 comparisons to make\n", - " 0.68s 1% - 576791 comparisons and 1 similarities\n", - " 1.30s 2% - 1153582 comparisons and 1 similarities\n", - " 1.90s 3% - 1730373 comparisons and 2 similarities\n", - " 2.50s 4% - 2307164 comparisons and 2 similarities\n", - " 3.10s 5% - 2883955 comparisons and 7 similarities\n", - " 3.69s 6% - 3460746 comparisons and 7 similarities\n", - " 4.26s 7% - 4037537 comparisons and 11 similarities\n", - " 4.87s 8% - 4614328 comparisons and 12 similarities\n", - " 5.48s 9% - 5191119 comparisons and 16 similarities\n", - " 6.12s 10% - 5767910 comparisons and 19 similarities\n", - " 6.69s 11% - 6344701 comparisons and 19 similarities\n", - " 7.25s 12% - 6921492 comparisons and 24 similarities\n", - " 7.84s 13% - 7498283 comparisons and 24 similarities\n", - " 8.43s 14% - 8075074 comparisons and 26 similarities\n", - " 9.01s 15% - 8651865 comparisons and 27 similarities\n", - " 9.63s 16% - 9228656 comparisons and 27 
similarities\n", - " 10s 17% - 9805447 comparisons and 29 similarities\n", - " 11s 18% - 10382238 comparisons and 30 similarities\n", - " 11s 19% - 10959029 comparisons and 33 similarities\n", - " 12s 20% - 11535820 comparisons and 33 similarities\n", - " 12s 21% - 12112611 comparisons and 35 similarities\n", - " 13s 22% - 12689402 comparisons and 38 similarities\n", - " 14s 23% - 13266193 comparisons and 38 similarities\n", - " 14s 24% - 13842984 comparisons and 41 similarities\n", - " 15s 25% - 14419775 comparisons and 49 similarities\n", - " 15s 26% - 14996566 comparisons and 49 similarities\n", - " 16s 27% - 15573357 comparisons and 50 similarities\n", - " 16s 28% - 16150148 comparisons and 56 similarities\n", - " 17s 29% - 16726939 comparisons and 58 similarities\n", - " 17s 30% - 17303730 comparisons and 64 similarities\n", - " 18s 31% - 17880521 comparisons and 66 similarities\n", - " 19s 32% - 18457312 comparisons and 66 similarities\n", - " 19s 33% - 19034103 comparisons and 68 similarities\n", - " 20s 34% - 19610894 comparisons and 69 similarities\n", - " 20s 35% - 20187685 comparisons and 70 similarities\n", - " 21s 36% - 20764476 comparisons and 73 similarities\n", - " 21s 37% - 21341267 comparisons and 75 similarities\n", - " 22s 38% - 21918058 comparisons and 75 similarities\n", - " 23s 39% - 22494849 comparisons and 75 similarities\n", - " 23s 40% - 23071640 comparisons and 76 similarities\n", - " 24s 41% - 23648431 comparisons and 77 similarities\n", - " 24s 42% - 24225222 comparisons and 77 similarities\n", - " 25s 43% - 24802013 comparisons and 77 similarities\n", - " 26s 44% - 25378804 comparisons and 77 similarities\n", - " 26s 45% - 25955595 comparisons and 81 similarities\n", - " 27s 46% - 26532386 comparisons and 91 similarities\n", - " 27s 47% - 27109177 comparisons and 91 similarities\n", - " 28s 48% - 27685968 comparisons and 96 similarities\n", - " 28s 49% - 28262759 comparisons and 97 similarities\n", - " 29s 50% - 28839550 comparisons 
and 97 similarities\n", - " 30s 51% - 29416341 comparisons and 97 similarities\n", - " 30s 52% - 29993132 comparisons and 97 similarities\n", - " 31s 53% - 30569923 comparisons and 97 similarities\n", - " 32s 54% - 31146714 comparisons and 97 similarities\n", - " 32s 55% - 31723505 comparisons and 97 similarities\n", - " 33s 56% - 32300296 comparisons and 99 similarities\n", - " 33s 57% - 32877087 comparisons and 100 similarities\n", - " 34s 58% - 33453878 comparisons and 102 similarities\n", - " 35s 59% - 34030669 comparisons and 104 similarities\n", - " 35s 60% - 34607460 comparisons and 105 similarities\n", - " 36s 61% - 35184251 comparisons and 107 similarities\n", - " 36s 62% - 35761042 comparisons and 107 similarities\n", - " 37s 63% - 36337833 comparisons and 107 similarities\n", - " 38s 64% - 36914624 comparisons and 108 similarities\n", - " 38s 65% - 37491415 comparisons and 109 similarities\n", - " 39s 66% - 38068206 comparisons and 111 similarities\n", - " 39s 67% - 38644997 comparisons and 111 similarities\n", - " 40s 68% - 39221788 comparisons and 111 similarities\n", - " 41s 69% - 39798579 comparisons and 114 similarities\n", - " 41s 70% - 40375370 comparisons and 115 similarities\n", - " 42s 71% - 40952161 comparisons and 115 similarities\n", - " 42s 72% - 41528952 comparisons and 116 similarities\n", - " 43s 73% - 42105743 comparisons and 116 similarities\n", - " 43s 74% - 42682534 comparisons and 116 similarities\n", - " 44s 75% - 43259325 comparisons and 123 similarities\n", - " 45s 76% - 43836116 comparisons and 125 similarities\n", - " 45s 77% - 44412907 comparisons and 125 similarities\n", - " 46s 78% - 44989698 comparisons and 125 similarities\n", - " 46s 79% - 45566489 comparisons and 129 similarities\n", - " 47s 80% - 46143280 comparisons and 133 similarities\n", - " 47s 81% - 46720071 comparisons and 134 similarities\n", - " 48s 82% - 47296862 comparisons and 151 similarities\n", - " 48s 83% - 47873653 comparisons and 168 similarities\n", - 
" 49s 84% - 48450444 comparisons and 170 similarities\n", - " 49s 85% - 49027235 comparisons and 173 similarities\n", - " 50s 86% - 49604026 comparisons and 178 similarities\n", - " 51s 87% - 50180817 comparisons and 181 similarities\n", - " 51s 88% - 50757608 comparisons and 182 similarities\n", - " 52s 89% - 51334399 comparisons and 186 similarities\n", - " 52s 90% - 51911190 comparisons and 196 similarities\n", - " 53s 91% - 52487981 comparisons and 203 similarities\n", - " 53s 92% - 53064772 comparisons and 220 similarities\n", - " 54s 93% - 53641563 comparisons and 225 similarities\n", - " 54s 94% - 54218354 comparisons and 229 similarities\n", - " 55s 95% - 54795145 comparisons and 233 similarities\n", - " 55s 96% - 55371936 comparisons and 235 similarities\n", - " 56s 97% - 55948727 comparisons and 242 similarities\n", - " 56s 98% - 56525518 comparisons and 248 similarities\n", - " 57s 99% - 57102309 comparisons and 253 similarities\n", - " 57s 100% - 57679100 comparisons and 261 similarities\n", - " 57s 100% - 57679170 comparisons and 261 similarities\n", - "Data written to /Users/me/gitlab.huc.knaw.nl/hermans/works/_temp/parallels/sim-0.4.zip\n" - ] - } - ], - "source": [ - "similarity = readResults(PARA_DIR, f\"sim-{A.version}.zip\")\n", - "if not similarity:\n", - " similarity = computeSim()\n", - " writeResults(similarity, PARA_DIR, f\"sim-{A.version}.zip\")" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "261" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(similarity)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So, not too many similarities.\n", - "\n", - "Let's find out which lines have the most correspondences." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "339 out of 10741 lines have at least one similar line\n" - ] - } - ], - "source": [ - "parallels = {}\n", - "\n", - "for (sentence, m) in similarity:\n", - " parallels.setdefault(sentence, set()).add(m)\n", - " parallels.setdefault(m, set()).add(sentence)\n", - "\n", - "print(f\"{len(parallels)} out of {nSentences} lines have at least one similar line\")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "rankedParallels = sorted(\n", - " parallels.items(),\n", - " key=lambda x: (-len(x[1]), x[0]),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 12 siblings of nms 6:p259.0 @ 266565 ====== Ik...\n", - " nms 17:p861.0 @ 268992 = 50%= Ik gaap.\n", - " nms 20:p1036.0 @ 269794 = 50%= Ik spring.\n", - " nms 34:p1734.0 @ 272887 = 50%= Ik zeg:\n", - " paranoia 6:p59.0 @ 258630 = 50%= Ik sliep.\n", - " paranoia 6:p32.0 @ 258278 = 50%= Ik wachtte.\n", - " nms 7:p284.0 @ 266699 = 50%= Ik niet.\n", - " paranoia 2:p40.0 @ 256084 = 50%= Ik las...\n", - " nms 20:p1027.0 @ 269751 = 50%= Ik niet...\n", - " paranoia 2:p92.0 @ 256537 = 50%= Ik viel.\n", - " nms 31:p1575.0 @ 272090 = 50%= Ik kijk.\n", - " nms 34:p1738.0 @ 272891 =100%= Ik:\n", - " nms 34:p1741.0 @ 272895 =100%= Ik:\n", - "\n", - " 11 siblings of nms 34:p1736.0 @ 272889 ====== Hij:\n", - " nms 18:p965.0 @ 269409 = 50%= Hij glimlacht.\n", - " nms 5:p228.0 @ 266435 = 50%= Hij wijst.\n", - " paranoia 7:p135.0 @ 259556 = 50%= Hij lachte.\n", - " nms 5:p228.0 @ 266437 = 50%= Hij spreekt.\n", - " nms 20:p1036.0 @ 269797 = 50%= Hij bukt.\n", - " nms 9:p410.0 @ 267259 = 50%= Hij lacht.\n", - " paranoia 7:p193.1 @ 259848 = 50%= Hij spartelde.\n", - " nms 22:p1065.0 @ 269923 = 50%= Hij 
vraagt:\n", - " nms 29:p1429.0 @ 271559 = 50%= Hij tekent.\n", - " nms 9:p394.0 @ 267195 = 50%= Hij vertelt, vertelt.\n", - " paranoia 6:p22.0 @ 258237 = 50%= Hij lachte.\n", - "\n", - " 6 siblings of paranoia 7:p16.0 @ 258883 ====== Ik zei niets.\n", - " paranoia 6:p50.0 @ 258563 = 50%= Ik ontdekte niets.\n", - " paranoia 8:p84.0 @ 260401 = 60%= Ik zei niets over Elena.\n", - " paranoia 7:p82.0 @ 259287 = 50%= Ik hoorde niets.\n", - " nms 29:p1467.0 @ 271705 = 50%= Ik vraag niets.\n", - " paranoia 7:p174.0 @ 259707 = 50%= Hij zei niets.\n", - " paranoia 7:p174.0 @ 259708 = 75%= Ik zei ook niets.\n", - "\n", - " 5 siblings of nms 7:p286.0 @ 266702 ====== Ik kan niet slapen.\n", - " nms 17:p861.0 @ 268993 = 57%= Ik ben moe, maar kan niet slapen.\n", - " nms 17:p863.0 @ 268996 =100%= Ik kan niet slapen.\n", - " nms 7:p284.0 @ 266699 = 50%= Ik niet.\n", - " nms 36:p1900.0 @ 273804 = 50%= Ik kan hier niet blijven.\n", - " nms 20:p1027.0 @ 269751 = 50%= Ik niet...\n", - "\n", - " 5 siblings of nms 36:p1902.0 @ 273807 ====== Maar ik val niet.\n", - " nms 36:p1905.0 @ 273834 = 50%= Maar ik zie hem niet.\n", - " nms 3:p172.0 @ 266194 = 50%= Maar luisteren doe ik niet.\n", - " paranoia 9:p20.0 @ 260853 = 50%= Maar ik...\n", - " nms 27:p1316.0 @ 271031 = 50%= Maar ik ben niet dood.\n", - " nms 7:p287.0 @ 266717 = 67%= Maar ik val niet in slaap.\n", - "\n", - " 4 siblings of paranoia 2:p62.0 @ 256262 ====== Ik begreep het niet.\n", - " nms 7:p284.0 @ 266699 = 50%= Ik niet.\n", - " paranoia 8:p75.0 @ 260324 = 60%= Ik begreep het onmiddellijk.\n", - " nms 34:p1794.0 @ 273132 = 60%= Ik vind het niet.\n", - " nms 20:p1027.0 @ 269751 = 50%= Ik niet...\n", - "\n", - " 4 siblings of nms 2:p96.0 @ 265947 ====== Het is een lichtdruk.\n", - " nms 41:p2005.0 @ 274642 = 50%= Het is een kleine zender.\n", - " su 1:p339.0 @ 262715 = 50%= Het is een gril, een manie.\n", - " nms 36:p1889.0 @ 273692 = 60%= Het is een poolvos.\n", - " paranoia 9:p110.0 @ 261295 = 50%= Het is uitsluitend een 
karaktertrek.\n", - "\n", - " 4 siblings of nms 13:p635.0 @ 268174 ====== Hij is Arne niet.\n", - " su 3:p56.0 @ 264432 = 60%= Hij is niet royaal.\n", - " su 1:p441.0 @ 263075 = 60%= Hij is niet gestegen .\n", - " nms 40:p1981.0 @ 274412 = 50%= Arne niet.\n", - " nms 11:p582.0 @ 267942 = 50%= Arne is er niet bij.\n", - "\n", - " 3 siblings of su 3:p171.0 @ 264795 ====== De slaap kwam niet.\n", - " su 3:p177.0 @ 264801 =100%= De slaap kwam niet.\n", - " su 3:p179.0 @ 264803 =100%= De slaap kwam niet.\n", - " su 3:p175.0 @ 264799 =100%= De slaap kwam niet.\n", - "\n", - " 3 siblings of nms 23:p1114.0 @ 270133 ====== Het spijt me.\n", - " nms 31:p1582.0 @ 272131 = 75%= Het spijt me, het spijt me.\n", - " nms 38:p1947.0 @ 274205 =100%= Het spijt me.\n", - " paranoia 9:p24.0 @ 260870 = 50%= Het spijt mij...\n", - "\n", - " 2 siblings of paranoia 5:p102.0 @ 257904 ====== Cleever zei niets terug.\n", - " paranoia 5:p106.0 @ 257926 = 60%= Maar Cleever zei niets.\n", - " su 3:p158.0 @ 264783 = 50%= De onderofficier zei niets terug.\n", - "\n", - " 2 siblings of paranoia 6:p3.0 @ 258064 ====== Ik keek niet om.\n", - " nms 7:p284.0 @ 266699 = 50%= Ik niet.\n", - " nms 20:p1027.0 @ 269751 = 50%= Ik niet...\n", - "\n", - " 2 siblings of paranoia 7:p179.0 @ 259751 ====== Boven.\n", - " nms 11:p548.0 @ 267813 =100%= Boven.\n", - " nms 36:p1890.0 @ 273702 =100%= Boven.\n", - "\n", - " 2 siblings of paranoia 9:p92.0 @ 261209 ====== Of...\n", - " nms 25:p1169.0 @ 270460 = 50%= Of als...\n", - " nms 29:p1448.0 @ 271622 = 50%= Of Mikkelsen.\n", - "\n", - " 2 siblings of su 1:p381.0 @ 262909 ====== Zij lacht.\n", - " nms 11:p563.0 @ 267873 =100%= Zij lacht.\n", - " nms 42:p2113.0 @ 274884 =100%= Zij lacht.\n", - "\n", - " 2 siblings of su 3:p201.0 @ 264877 ====== Nee!\n", - " nms 38:p1937.0 @ 274096 = 50%= Spijt? 
Nee.\n", - " nms 30:p1524.0 @ 271918 =100%= Nee.\n", - "\n", - " 2 siblings of nms 5:p228.0 @ 266436 ====== Hij klapt de buitenste bril naar beneden.\n", - " nms 6:p252.0 @ 266536 = 56%= Hij klapt de buitenste bril weer omhoog.\n", - " nms 6:p239.0 @ 266473 = 56%= Nummedal slaat de buitenste bril naar beneden.\n", - "\n", - " 2 siblings of nms 8:p344.0 @ 266938 ====== Natuurlijk, Alfred.\n", - " su 2:p380.0 @ 264122 = 50%= Natuurlijk.\n", - " nms 35:p1867.0 @ 273567 = 50%= Alfred.\n", - "\n", - " 2 siblings of nms 8:p345.0 @ 266941 ====== Het is waar.\n", - " nms 45:p2292.0 @ 275461 = 50%= Het is gravlaks.\n", - " paranoia 5:p81.0 @ 257815 = 75%= Het is werkelijk waar.\n", - "\n", - " 2 siblings of nms 15:p761.0 @ 268700 ====== Arne zegt:\n", - " nms 25:p1175.0 @ 270475 =100%= Arne zegt:\n", - " nms 26:p1219.0 @ 270700 =100%= Arne zegt:\n", - "\n", - " 2 siblings of nms 17:p864.0 @ 269006 ====== Ik kruip uit de slaapzak en ga rechtop zitten.\n", - " nms 25:p1165.0 @ 270426 = 50%= Ik ruk de treksluitingen los en ga rechtop zitten.\n", - " nms 28:p1383.0 @ 271338 = 56%= Ik kruip uit de slaapzak.\n", - "\n", - " 2 siblings of nms 18:p968.0 @ 269435 ====== Ik blijf niet achter.\n", - " nms 7:p284.0 @ 266699 = 50%= Ik niet.\n", - " nms 20:p1027.0 @ 269751 = 50%= Ik niet...\n", - "\n", - " 2 siblings of nms 18:p972.0 @ 269449 ====== Ik kijk op mijn horloge.\n", - " nms 24:p1130.0 @ 270243 = 56%= Ik kijk op mijn horloge: het is vier uur.\n", - " nms 34:p1808.0 @ 273199 = 50%= Hoe laat is het nu? 
Ik kijk op mijn horloge.\n", - "\n", - " 2 siblings of nms 20:p1041.0 @ 269806 ====== Ik ben erover.\n", - " nms 36:p1888.0 @ 273688 = 50%= Ik ben boven.\n", - " nms 34:p1757.0 @ 272953 = 50%= Ik ben machteloos.\n", - "\n", - " 2 siblings of nms 25:p1170.0 @ 270466 ====== Ik kijk naar hem om.\n", - " nms 2:p101.0 @ 265963 = 67%= Ik kijk naar hem op.\n", - " nms 9:p424.0 @ 267326 = 57%= Ik draai mij naar hem om.\n", - "\n", - " 2 siblings of nms 26:p1222.0 @ 270712 ====== Ik heb het over het begin.\n", - " nms 27:p1316.0 @ 271032 = 50%= Ik heb het overleefd.\n", - " nms 27:p1319.0 @ 271051 = 50%= Ik heb het overleefd.\n", - "\n", - " 2 siblings of nms 32:p1637.0 @ 272417 ====== Verder zie ik niets.\n", - " nms 29:p1426.0 @ 271553 = 50%= Verder heb ik niets gezien.\n", - " nms 12:p597.0 @ 268002 = 50%= Verder niets.\n", - "\n", - " 2 siblings of nms 32:p1657.0 @ 272471 ====== Ik ga naast Arne zitten.\n", - " nms 44:p2167.0 @ 275066 = 60%= Ik ga zitten.\n", - " nms 20:p1041.0 @ 269813 = 50%= Ik sta naast Arne.\n", - "\n", - " 2 siblings of nms 36:p1897.0 @ 273789 ====== Ik ben niet treurig.\n", - " nms 7:p284.0 @ 266699 = 50%= Ik niet.\n", - " nms 20:p1027.0 @ 269751 = 50%= Ik niet...\n", - "\n", - " 2 siblings of nms 41:p2004.0 @ 274628 ====== Drie.\n", - " nms 19:p1015.0 @ 269680 = 50%= Drie kilometer.\n", - " paranoia 6:p18.0 @ 258210 = 50%= Drie jaar.\n", - "\n", - " 2 siblings of nms 47:p2362.0 @ 275689 ====== Zij kan het niet helpen.\n", - " paranoia 5:p10.0 @ 257353 = 50%= Ik kan het toch ook niet helpen.\n", - " nms 30:p1524.0 @ 271921 = 50%= Eenvoudiger kan het niet.\n", - "\n", - " 1 sibling of paranoia 1:p38.0 @ 255696 ====== Er is maar een werkelijk woord: chaos.\n", - " paranoia 1:p40.0 @ 255704 = 55%= Er is in onze talen maar één werkelijk woord: chaos.\n", - "\n", - " 1 sibling of paranoia 2:p5.0 @ 255776 ====== Goed.\n", - " nms 27:p1310.0 @ 271017 =100%= Goed.\n", - "\n", - " 1 sibling of paranoia 2:p41.0 @ 256093 ====== Het is twintig jaar 
geleden.\n", - " su 3:p449.0 @ 265518 = 57%= Het is 2176 jaar geleden gebeurd.\n", - "\n", - " 1 sibling of paranoia 2:p46.0 @ 256115 ====== Zijn gezicht had de kleur van gerookte zalm, waarop zijn blonde baardstoppels zout strooiden.\n", - " paranoia 2:p90.0 @ 256517 = 57%= Het was het hoofd van de Reyer Ansloschool , wiens gezicht de kleur had van gerookte zalm, waarop zijn baardstoppels zout strooiden.\n", - "\n", - " 1 sibling of paranoia 2:p79.0 @ 256425 ====== Zij gaf geen antwoord.\n", - " paranoia 6:p61.0 @ 258634 = 60%= Ik gaf geen antwoord.\n", - "\n", - " 1 sibling of paranoia 2:p90.0 @ 256509 ====== Maar de jongens kwamen niet tot rust.\n", - " paranoia 6:p75.0 @ 258716 = 50%= Zij kwamen de eerste uren niet tot rust.\n", - "\n", - " 1 sibling of paranoia 3:p9.0 @ 256589 ====== De deur ging gemakkelijk open.\n", - " nms 41:p2002.0 @ 274614 = 50%= De deur gaat open.\n", - "\n", - " 1 sibling of paranoia 3:p11.0 @ 256600 ====== Ook hier kwam geen licht.\n", - " paranoia 6:p48.0 @ 258535 = 50%= Ook geen licht trouwens.\n", - "\n", - " 1 sibling of paranoia 3:p20.0 @ 256629 ====== Hij sloeg zijn kraag op van angst.\n", - " paranoia 7:p146.0 @ 259586 = 56%= Hij hief zijn handen op van angst.\n", - "\n", - " 1 sibling of paranoia 3:p62.0 @ 256821 ====== Dat heb ik gedaan.\n", - " nms 27:p1277.0 @ 270894 = 50%= Dat heb ik altijd gehad.\n", - "\n", - " 1 sibling of paranoia 3:p66.0 @ 256832 ====== Cleever schudde het hoofd.\n", - " paranoia 9:p66.0 @ 261068 = 60%= Bernard schudde het hoofd.\n", - "\n", - " 1 sibling of paranoia 3:p67.0 @ 256838 ====== Zo'n leven is niet uit te houden .\n", - " nms 25:p1164.0 @ 270425 = 56%= Dit is niet uit te houden.\n", - "\n", - " 1 sibling of paranoia 3:p68.0 @ 256844 ====== Achter mijn kamer is nog een klein hokje, dat ik nu als bergplaats gebruik.\n", - " paranoia 3:p72.0 @ 256867 = 57%= Achter mijn kamer is nog een klein hokje.\n", - "\n", - " 1 sibling of paranoia 3:p68.0 @ 256847 ====== Het uitzicht is mooi.\n", - " nms 
9:p426.0 @ 267330 = 60%= Het uitzicht is schitterend.\n", - "\n", - " 1 sibling of paranoia 3:p104.0 @ 257065 ====== Altijd zette juist voor zijn huis de cicerone een megafoon aan de mond en riep: Dit is de Blauwbrug.\n", - " paranoia 5:p141.0 @ 258038 = 67%= De cicerone zette zijn megafoon aan de mond en riep: Dit is de Blauwbrug.\n", - "\n", - " 1 sibling of paranoia 3:p110.0 @ 257097 ====== Hij kon niets zien.\n", - " paranoia 8:p31.0 @ 260096 = 50%= Hij kon dus weer zien.\n", - "\n", - " 1 sibling of paranoia 3:p124.0 @ 257172 ====== Het was een oud dier.\n", - " paranoia 7:p84.0 @ 259304 = 50%= Het was een aquarium .\n", - "\n", - " 1 sibling of paranoia 5:p27.0 @ 257463 ====== Waarom niet?\n", - " su 3:p221.0 @ 264922 = 67%= Waarom niet? (s.\n", - "\n", - " 1 sibling of paranoia 5:p30.0 @ 257483 ====== Ik ben helemaal niet op het distributiekantoor geweest, want ik had vergeten er heen te gaan.\n", - " paranoia 5:p81.0 @ 257809 = 50%= Ik ben helemaal niet op het distributiekantoor geweest.\n", - "\n", - " 1 sibling of paranoia 5:p49.0 @ 257599 ====== Kom mee , fluisterde hij.\n", - " paranoia 7:p162.0 @ 259656 = 50%= Kom mee!\n", - "\n", - " 1 sibling of paranoia 5:p57.0 @ 257637 ====== Daarom durfde zij ook niet meer te roepen.\n", - " paranoia 5:p90.0 @ 257859 = 67%= Arnold durfde zij niet meer te roepen.\n", - "\n", - " 1 sibling of paranoia 5:p59.0 @ 257652 ====== Arnold , zei ze zachtjes.\n", - " paranoia 5:p108.0 @ 257932 = 60%= Arnold, lieverdje , zei ze.\n", - "\n", - " 1 sibling of paranoia 5:p81.0 @ 257819 ====== Wij kunnen hier blijven .\n", - " paranoia 5:p110.0 @ 257949 = 60%= Wij blijven hier, samen.\n", - "\n", - " 1 sibling of paranoia 6:p5.0 @ 258120 ====== Ik heb dorst.\n", - " nms 25:p1172.0 @ 270471 =100%= Ik heb dorst.\n", - "\n", - " 1 sibling of paranoia 6:p16.0 @ 258204 ====== Er werd nu nergens meer geschoten.\n", - " paranoia 7:p150.0 @ 259599 = 57%= Er werd niet meer geschoten.\n", - "\n", - " 1 sibling of paranoia 6:p43.1 @ 258464 
====== Niets.\n", - " nms 34:p1815.0 @ 273232 =100%= Niets.\n", - "\n", - " 1 sibling of paranoia 7:p1.0 @ 258749 ====== Het is mijn huis.\n", - " nms 22:p1086.0 @ 269993 = 50%= Het is mijn ingeschapen puritanisme.\n", - "\n", - " 1 sibling of paranoia 7:p2.0 @ 258759 ====== Ik ging naar beneden.\n", - " nms 31:p1539.0 @ 271977 = 60%= Ik ging naar Trondheim.\n", - "\n", - " 1 sibling of paranoia 7:p18.0 @ 258900 ====== Er was niets aan te zien.\n", - " paranoia 7:p190.1 @ 259819 =100%= Er was niets aan te zien.\n", - "\n", - " 1 sibling of paranoia 7:p51.0 @ 259065 ====== Dat kan ik mij niet voorstellen!\n", - " nms 32:p1668.0 @ 272551 =100%= Dat kan ik mij niet voorstellen.\n", - "\n", - " 1 sibling of paranoia 7:p54.0 @ 259071 ====== Ik bleef zitten.\n", - " nms 44:p2167.0 @ 275066 = 50%= Ik ga zitten.\n", - "\n", - " 1 sibling of paranoia 7:p75.0 @ 259252 ====== Ik ging op het andere bed liggen en voelde aan haar wang.\n", - " paranoia 7:p108.1 @ 259433 = 50%= Ik ging bij haar zitten op het bed en voelde met mijn vingertoppen aan haar gezicht.\n", - "\n", - " 1 sibling of paranoia 7:p105.0 @ 259404 ====== Het geheime wapen.\n", - " paranoia 7:p140.0 @ 259573 =100%= Het geheime wapen.\n", - "\n", - " 1 sibling of paranoia 7:p186.0 @ 259790 ====== Maar hij zei niets.\n", - " paranoia 5:p106.0 @ 257926 = 60%= Maar Cleever zei niets.\n", - "\n", - " 1 sibling of paranoia 7:p196.0 @ 259885 ====== Ik hield het voor mogelijk, heel goed mogelijk.\n", - " paranoia 8:p69.0 @ 260289 = 50%= Ik weet het heel goed.\n", - "\n", - " 1 sibling of paranoia 8:p52.0 @ 260217 ====== Het was glas.\n", - " paranoia 8:p63.0 @ 260274 = 50%= Het was leeg.\n", - "\n", - " 1 sibling of paranoia 8:p61b.0 @ 260257 ====== Glas...\n", - " paranoia 8:p73.0 @ 260303 =100%= Glas...\n", - "\n", - " 1 sibling of paranoia 8:p79.0 @ 260366 ====== Ik wachtte twintig minuten.\n", - " paranoia 6:p32.0 @ 258278 = 50%= Ik wachtte.\n", - "\n", - " 1 sibling of paranoia 9:p17.0 @ 260784 ====== Het spijt me 
dat ik er niet was om je te ontvangen.\n", - " paranoia 9:p20.0 @ 260852 = 85%= Het spijt mij dat ik er niet was om je te ontvangen.\n", - "\n", - " 1 sibling of paranoia 9:p17.0 @ 260790 ====== Bernard, maak contact met mij...\n", - " paranoia 9:p19.0 @ 260806 =100%= Bernard maak contact met mij.\n", - "\n", - " 1 sibling of paranoia 9:p19.0 @ 260810 ====== Geef hem je adres, Bernard.\n", - " paranoia 9:p24.0 @ 260866 =100%= Geef hem je adres Bernard.\n", - "\n", - " 1 sibling of paranoia 9:p19.0 @ 260811 ====== Vertrouw op mij, je broer Gerard.\n", - " paranoia 9:p24.0 @ 260867 =100%= Vertrouw op mij, je broer Gerard.\n", - "\n", - " 1 sibling of paranoia 9:p35.0 @ 260948 ====== Vraag je mij, ik zie in zijn lange haren eerder een soort uitdaging.\n", - " su 3:p279.0 @ 265069 =100%= Vraag je mij, ik zie in zijn lange haren eerder een soort uitdaging.\n", - "\n", - " 1 sibling of paranoia 9:p35.0 @ 260949 ====== Juist die lange haren maken het bijna onvoorstelbaar dat Absalom het woord kapsalon niet heeft gekend.\n", - " su 3:p279.0 @ 265070 =100%= Juist die lange haren maken het bijna onvoorstelbaar dat Absalom het woord kapsalon niet heeft gekend.\n", - "\n", - " 1 sibling of paranoia 9:p41.0 @ 260966 ====== En toch, íets ervan is tot hem doorgedrongen.\n", - " su 3:p281.0 @ 265073 =100%= En toch, íets ervan is tot hem doorgedrongen.\n", - "\n", - " 1 sibling of paranoia 9:p41.0 @ 260967 ====== Daarom heeft hij zijn haar laten groeien ! Hij wist dat zijn lot samenhing met zijn haar! Zijn naam had het hem toegefluisterd, maar zo onduidelijk dat hij het niet begrepen heeft , omdat immers het woord kapsalon nog duizenden jaren vóór hem lag.\n", - " su 3:p281.0 @ 265074 = 94%= Daarom heeft hij zijn haar laten groeien! Hij wist dat zijn lot samenhing met zijn haar! 
Zijn naam had het hem toegefluisterd, maar zo onduidelijk dat hij het niet begrepen heeft, omdat immers het woord kapsalon nog duizenden jaren voor hem lag.\n", - "\n", - " 1 sibling of paranoia 9:p41.0 @ 260972 ====== Zijn geest heeft het niet kunnen inhalen en hij is te gronde gegaan aan een lot dat hij had kunnen voorkomen.\n", - " su 3:p281.0 @ 265075 =100%= Zijn geest heeft het niet kunnen inhalen en hij is te gronde gegaan aan een lot dat hij had kunnen voorkomen.\n", - "\n", - " 1 sibling of paranoia 9:p41.0 @ 260973 ====== Wij Nederlanders kunnen Absalom's geschiedenis niet anders zien dan in dit licht.\n", - " su 3:p281.0 @ 265076 =100%= Wij Nederlanders kunnen Absalom 's geschiedenis niet anders zien dan in dit licht .\n", - "\n", - " 1 sibling of paranoia 9:p114.0 @ 261318 ====== Ik kan doen wat ik wil.\n", - " nms 35:p1848.0 @ 273391 =100%= Ik kan doen wat ik wil.\n", - "\n", - " 1 sibling of paranoia 9:p129.0 @ 261388 ====== Maar iets...\n", - " nms 38:p1958.0 @ 274250 = 50%= Maar wel iets anders.\n", - "\n", - " 1 sibling of su 1:p266.0 @ 262484 ====== Op de zijgevel staat geschilderd Hotel Multatuli .\n", - " su 1:p280.0 @ 262530 = 50%= Op de achterkant staat gedrukt: Hotel Multatuli , Inh.\n", - "\n", - " 1 sibling of su 1:p273.0 @ 262507 ====== Dan niet.\n", - " nms 25:p1168.0 @ 270445 = 67%= Dan maar niet.\n", - "\n", - " 1 sibling of su 1:p281.0 @ 262536 ====== Ik vergeet naar de datum te kijken.\n", - " nms 47:p2343.0 @ 275631 = 50%= Ik was vergeten naar de bodem te kijken.\n", - "\n", - " 1 sibling of su 1:p405.0 @ 262970 ====== Inderdaad.\n", - " su 3:p23.0 @ 264281 =100%= Inderdaad.\n", - "\n", - " 1 sibling of su 2:p124.0 @ 263537 ====== Nee, reclamemaken doet hij niet.\n", - " nms 42:p2041.0 @ 274736 = 50%= Nee, hij ook niet.\n", - "\n", - " 1 sibling of su 2:p153.0 @ 263605 ====== Hij is een vakman.\n", - " su 2:p360.0 @ 264074 = 50%= Hij is een geestelijke emigrant.\n", - "\n", - " 1 sibling of su 2:p226.0 @ 263748 ====== Grote 
vraag...\n", - " nms 19:p991.0 @ 269560 = 50%= Grote.\n", - "\n", - " 1 sibling of su 2:p235.0 @ 263788 ====== Dr.\n", - " su 3:p464.0 @ 265555 =100%= Dr.\n", - "\n", - " 1 sibling of su 2:p343.0 @ 264033 ====== Enz.\n", - " su 3:p238.0 @ 264964 =100%= Enz.\n", - "\n", - " 1 sibling of su 2:p362.0 @ 264076 ====== De eerste soort wil zich, zichzelf , rechtvaardigen als mens.\n", - " su 2:p363.0 @ 264077 = 55%= De tweede soort wil zich rechtvaardigen als schrijver.\n", - "\n", - " 1 sibling of su 2:p392.0 @ 264141 ====== Het zijn b.\n", - " nms 30:p1528.0 @ 271956 = 50%= Het zijn meteorieten.\n", - "\n", - " 1 sibling of su 3:p174.0 @ 264798 ====== Hij ging op z 'n buik liggen.\n", - " su 3:p178.0 @ 264802 = 67%= Hij ging op z 'n andere zij liggen.\n", - "\n", - " 1 sibling of nms 1:p41.0 @ 265738 ====== Neemt u een stoel.\n", - " nms 44:p2163.0 @ 275059 =100%= Neemt u een stoel.\n", - "\n", - " 1 sibling of nms 1:p45.0 @ 265748 ====== Ja.\n", - " nms 13:p671.0 @ 268353 =100%= Ja.\n", - "\n", - " 1 sibling of nms 1:p53.0 @ 265786 ====== Rauwe zalm, die eerst begraven wordt en later weer opgegraven.\n", - " nms 45:p2294.0 @ 275473 = 53%= Rauwe zalm, die eerst een tijdlang ergens begraven wordt en dan, ik weet niet hoeveel tijd later, weer opgegraven.\n", - "\n", - " 1 sibling of nms 1:p55.0 @ 265796 ====== Stilte.\n", - " nms 22:p1081.0 @ 269980 =100%= Stilte.\n", - "\n", - " 1 sibling of nms 2:p96.0 @ 265943 ====== Ik doe een stap naar hem toe.\n", - " nms 44:p2161.0 @ 275050 = 50%= Ik strompel naar hem toe.\n", - "\n", - " 1 sibling of nms 5:p228.0 @ 266434 ====== Hij wipt de buitenste bril omhoog.\n", - " nms 6:p252.0 @ 266536 = 62%= Hij klapt de buitenste bril weer omhoog.\n", - "\n", - " 1 sibling of nms 6:p236.0 @ 266462 ====== Nummedal niet.\n", - " nms 44:p2184.0 @ 275128 = 50%= Ook Nummedal lacht niet.\n", - "\n", - " 1 sibling of nms 6:p250.0 @ 266526 ====== Nummedal steekt zijn hand uit.\n", - " nms 44:p2188.0 @ 275137 = 50%= Nummedal steekt zijn hand uit, 
maar grijpt naast het boek.\n", - "\n", - " 1 sibling of nms 6:p265.0 @ 266594 ====== Alfred I.\n", - " nms 35:p1867.0 @ 273567 = 50%= Alfred.\n", - "\n", - " 1 sibling of nms 7:p290.0 @ 266731 ====== Ik klap het open en bekijk mijn gezicht in het spiegeltje.\n", - " nms 27:p1302.0 @ 270987 = 50%= Mijn kostbare kompas! Ik klap het open en bekijk mijn voorhoofd.\n", - "\n", - " 1 sibling of nms 8:p345.0 @ 266949 ====== Nu moet ik wel.\n", - " nms 11:p543.0 @ 267780 = 50%= Nu wel.\n", - "\n", - " 1 sibling of nms 9:p363.0 @ 267062 ====== Maar ik heb haast.\n", - " paranoia 9:p20.0 @ 260853 = 50%= Maar ik...\n", - "\n", - " 1 sibling of nms 9:p466.0 @ 267510 ====== Sommige mensen weten de gewoonste dingen niet.\n", - " nms 17:p900.0 @ 269154 = 56%= De eenvoudigste dingen weten de mensen niet.\n", - "\n", - " 1 sibling of nms 10:p489.0 @ 267605 ====== De zin luidt: Does Alfred go to the races ? No, he doesn't.\n", - " nms 31:p1605.0 @ 272226 = 67%= Does Alfred go to the races today? No, he doesn't.\n", - "\n", - " 1 sibling of nms 11:p534.0 @ 267744 ====== Grond? 
Sneeuw.\n", - " nms 38:p1943.0 @ 274163 = 50%= Sneeuw.\n", - "\n", - " 1 sibling of nms 11:p540.0 @ 267768 ====== Moet haast wel.\n", - " nms 29:p1411.0 @ 271469 = 67%= Moet wel.\n", - "\n", - " 1 sibling of nms 11:p562.0 @ 267871 ====== Over bed gesproken.\n", - " nms 30:p1493.0 @ 271826 = 50%= Over pijn gesproken.\n", - "\n", - " 1 sibling of nms 13:p642.0 @ 268218 ====== Ik ben blijven staan.\n", - " nms 45:p2303.0 @ 275496 = 60%= Ik ben gaan staan.\n", - "\n", - " 1 sibling of nms 15:p763.0 @ 268705 ====== Arne hurkt.\n", - " nms 19:p1010.0 @ 269656 = 50%= Arne hurkt naast mij.\n", - "\n", - " 1 sibling of nms 15:p782.0 @ 268772 ====== Ik sta op, lach en schud van nee.\n", - " nms 17:p899.0 @ 269149 = 50%= Ik schud van nee.\n", - "\n", - " 1 sibling of nms 16:p814.0 @ 268855 ====== Jij?\n", - " nms 42:p2139.0 @ 274944 =100%= Jij?\n", - "\n", - " 1 sibling of nms 16:p847.0 @ 268953 ====== Wist je dat?\n", - " nms 26:p1246.0 @ 270812 =100%= Wist je dat?\n", - "\n", - " 1 sibling of nms 17:p886.0 @ 269107 ====== De groene tent van Qvigstad is nu open.\n", - " nms 32:p1637.0 @ 272418 = 55%= De groene tent van Mikkelsen en Qvigstad is verdwenen.\n", - "\n", - " 1 sibling of nms 17:p927.0 @ 269250 ====== Ik dus ook.\n", - " nms 17:p927.0 @ 269265 = 50%= Ik ook maar.\n", - "\n", - " 1 sibling of nms 18:p951.0 @ 269323 ====== Mikkelsen en ik schieten in de lach.\n", - " nms 26:p1247.0 @ 270813 = 75%= Arne en ik schieten in de lach.\n", - "\n", - " 1 sibling of nms 18:p951.0 @ 269324 ====== Arne staat op, kijkt door zijn knuisten en zegt:\n", - " nms 34:p1763.0 @ 272966 = 55%= Arne kijkt op zijn eigen kaart en staat op.\n", - "\n", - " 1 sibling of nms 19:p990.0 @ 269558 ====== Ik trek de kaart eruit en bekijk de route nauwkeurig met mijn vergrootglas.\n", - " nms 36:p1908.0 @ 273850 = 57%= Ik ga zitten en bekijk mijn kaart nauwkeurig met mijn vergrootglas.\n", - "\n", - " 1 sibling of nms 19:p992.0 @ 269565 ====== Qvigstad drinkt eruit en geeft de fles aan Arne.\n", - 
" nms 19:p992.0 @ 269566 = 70%= Arne drinkt en geeft de fles aan mij.\n", - "\n", - " 1 sibling of nms 19:p1005.0 @ 269637 ====== Mijn horloge staat op vijf over half tien.\n", - " nms 34:p1808.0 @ 273200 = 50%= Het staat op tien over half zes.\n", - "\n", - " 1 sibling of nms 20:p1027.0 @ 269749 ====== Ik heb wel wat anders te doen.\n", - " nms 26:p1196.0 @ 270578 = 50%= Ik heb niets te doen.\n", - "\n", - " 1 sibling of nms 20:p1028.0 @ 269757 ====== Hij wijst, hij roept.\n", - " nms 5:p228.0 @ 266435 = 50%= Hij wijst.\n", - "\n", - " 1 sibling of nms 20:p1029.0 @ 269762 ====== Ik blijf staan.\n", - " nms 39:p1977.0 @ 274391 = 50%= Ik blijf lopen.\n", - "\n", - " 1 sibling of nms 20:p1035.0 @ 269786 ====== Hier blijven staan kan ook niet.\n", - " nms 20:p1035.0 @ 269789 = 50%= Ik kan niet blijven staan en ik kan ook niet weglopen.\n", - "\n", - " 1 sibling of nms 22:p1074.0 @ 269951 ====== Ik kom naasthem staan, leun.\n", - " nms 27:p1264.0 @ 270864 = 50%= Ik neem een aanloop en kom naast hem staan.\n", - "\n", - " 1 sibling of nms 23:p1099.0 @ 270065 ====== Vier kilometer.\n", - " nms 35:p1841.0 @ 273355 = 50%= Vier kilometer, hemelsbreed gemeten.\n", - "\n", - " 1 sibling of nms 25:p1152.0 @ 270357 ====== Arne snurkt.\n", - " nms 28:p1380.0 @ 271306 =100%= Arne snurkt.\n", - "\n", - " 1 sibling of nms 26:p1197.0 @ 270589 ====== Ik heb geen oponthoud veroorzaakt.\n", - " nms 39:p1977.0 @ 274389 = 50%= Ik heb geen honger.\n", - "\n", - " 1 sibling of nms 29:p1438.0 @ 271592 ====== Hij werkt verder.\n", - " nms 32:p1688.0 @ 272592 = 50%= Hij loopt verder.\n", - "\n", - " 1 sibling of nms 30:p1525.0 @ 271933 ====== Ik wil hem...\n", - " nms 40:p1982.0 @ 274422 = 50%= Ik wil slapen.\n", - "\n", - " 1 sibling of nms 31:p1607.0 @ 272234 ====== Sibbelee zal mij niet begrijpen.\n", - " nms 31:p1607.0 @ 272235 = 50%= Niemand zal mij begrijpen.\n", - "\n", - " 1 sibling of nms 31:p1616.0 @ 272257 ====== Van nieuwe pap koken kan geen sprake zijn.\n", - " nms 34:p1755.0 @ 
272944 = 56%= Van schrijven kan geen sprake zijn.\n", - "\n", - " 1 sibling of nms 32:p1668.0 @ 272554 ====== Ik moet wel diep geslapen hebben.\n", - " nms 37:p1913.0 @ 273898 = 57%= Ik moet werkelijk hebben geslapen.\n", - "\n", - " 1 sibling of nms 33:p1703.0 @ 272653 ====== Ik strijkeen derde lucifer af.\n", - " nms 40:p1987.0 @ 274470 = 50%= Ik strijk de eerste lucifer af.\n", - "\n", - " 1 sibling of nms 33:p1709.0 @ 272689 ====== Een dier.\n", - " nms 36:p1889.0 @ 273691 =100%= Een dier.\n", - "\n", - " 1 sibling of nms 35:p1848.0 @ 273392 ====== Pissen waar ik wil.\n", - " nms 35:p1848.0 @ 273393 = 60%= Poepen waar ik wil.\n", - "\n", - " 1 sibling of nms 38:p1961.0 @ 274290 ====== Heel simpel.\n", - " nms 45:p2283.0 @ 275432 =100%= Heel simpel.\n", - "\n", - " 1 sibling of nms 40:p1986.0 @ 274454 ====== Nog een forel.\n", - " nms 40:p1986.0 @ 274458 = 50%= Nog een vis.\n", - "\n", - " 1 sibling of nms 42:p2060.0 @ 274780 ====== Waarom loopt u zo ongelukkig?\n", - " nms 44:p2163.0 @ 275060 = 67%= Waarom loopt u zo onregelmatig?\n", - "\n", - " 1 sibling of nms 44:p2169.0 @ 275072 ====== Eindelijk zeg ik:\n", - " nms 44:p2188.0 @ 275139 = 50%= Daarna zeg ik:\n", - "\n", - " 1 sibling of nms 44:p2195.0 @ 275170 ====== DirektørHvalbiff.\n", - " nms 44:p2201.0 @ 275192 = 50%= Hvalbiff.\n", - "\n", - " 1 sibling of nms 45:p2272.0 @ 275393 ====== Wilma zegt:\n", - " nms 45:p2277.0 @ 275407 =100%= Wilma zegt:\n", - "\n" - ] - } - ], - "source": [ - "seen = set()\n", - "\n", - "\n", - "def getPos(node):\n", - " sec = A.sectionStrFromNode(node)\n", - " return f\"{sec:<15} @ {node:>5}\"\n", - "\n", - "\n", - "for (sentence, paras) in rankedParallels:\n", - " if sentence in seen:\n", - " continue\n", - " plural = \" \" if len(paras) == 1 else \"s\"\n", - " prefix = f\"{len(paras):>4} sibling{plural} of \"\n", - " blank = \" \" * len(prefix)\n", - " print(f\"{prefix}{getPos(sentence)} ====== {T.text(sentence).strip()}\")\n", - " for para in paras:\n", - " sim = 
similarity[(sentence, para)] if (sentence, para) in similarity else similarity[(para, sentence)]\n", - " print(f\"{blank}{getPos(para)} ={sim:>3}%= {T.text(para).strip()}\")\n", - " seen.add(para)\n", - " print(\"\")\n", - " seen.add(sentence)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And how many lines have just one correspondence?\n", - "\n", - "We look at the tail of rankedParallels." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Why not make an overview of exactly how wide-spread parallel lines are?\n", - "\n", - "We count how many lines have how many parallels." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Add parallels to the TF dataset\n", - "\n", - "We can add this information to the Oldbabylonian dataset as an *edge feature*.\n", - "\n", - "An edge feature links two nodes and may annotate that link with a value.\n", - "\n", - "For parallels, we link each line to each of its parallel lines and we annotate that link with the similarity between\n", - "the two lines. The similarity is a percentage, and we round it to integer values.\n", - "\n", - "If *n1* is similar to *n2*, then *n2* is similar to *n1*.\n", - "In order to save space, we only add such links once.\n", - "\n", - "We can then use\n", - "[`E.sim.b(node)`](https://annotation.github.io/text-fabric/Api/Features/#edge-features)\n", - "to find all nodes that are parallel to node.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "from tfFromTei import SETTINGS\n", - "\n", - "metaData = {\n", - " \"\": SETTINGS[\"generic\"],\n", - " \"sim\": {\n", - " \"valueType\": \"int\",\n", - " \"edgeValues\": True,\n", - " \"description\": (\n", - " \"similarity between sentences \"\n", - " \" as a percentage of the common material wrt. 
the combined material\"\n", - " ),\n", - " },\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "simData = {}\n", - "for ((f, t), d) in similarity.items():\n", - " simData.setdefault(f, {})[t] = d" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "backendBase = os.path.expanduser(f\"~/{A.backend}\")\n", - "mod = \"parallels\"\n", - "path = f\"{A.context.org}/{A.context.repo}/{mod}/tf\"\n", - "location = f\"{backendBase}/{path}\"\n", - "module = A.version" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 0.00s Exporting 0 node and 1 edge and 0 config features to ~/gitlab.huc.knaw.nl/hermans/works/parallels/tf/0.4:\n", - " | 0.00s T sim to ~/gitlab.huc.knaw.nl/hermans/works/parallels/tf/0.4\n", - " 0.00s Exported 0 node features and 1 edge features and 0 config features to ~/gitlab.huc.knaw.nl/hermans/works/parallels/tf/0.4\n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "TF.save(\n", - " edgeFeatures=dict(sim=simData), metaData=metaData, location=location, module=module, silent=\"auto\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Use the parallels module\n", - "\n", - "We load the Oldbabylonian corpus again, but now with the parallels module." 
- ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "TF-app: ~/gitlab.huc.knaw.nl/hermans/works/app" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "data: ~/gitlab.huc.knaw.nl/hermans/works/tf/0.4" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "data: ~/gitlab.huc.knaw.nl/hermans/works/parallels/tf/0.4" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Text-Fabric: Text-Fabric API 10.2.1, hermans/works/app v3, Search Reference
Data: WORKS, Character table, Feature docs
Features:
\n", - "
hermans/works/parallels/tf\n", - "
\n", - "\n", - "
\n", - "
\n", - "sim\n", - "
\n", - "
int
\n", - "\n", - " similarity between sentences as a percentage of the common material wrt. the combined material\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "\n", - "
W.F. Hermans - Volledige Werken\n", - "
\n", - "\n", - "
\n", - "
\n", - "acro\n", - "
\n", - "
str
\n", - "\n", - " acronym of a work\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "anchor_id\n", - "
\n", - "
str
\n", - "\n", - " id of an anchor element\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "author\n", - "
\n", - "
str
\n", - "\n", - " author of a book (titleRef elements)\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "cat\n", - "
\n", - "
str
\n", - "\n", - " cat of a book (titleRef elements)\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "cause\n", - "
\n", - "
str
\n", - "\n", - " nature of a variant (lem or rdg)\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "code\n", - "
\n", - "
str
\n", - "\n", - " kind of something, e.g. a name\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "completed\n", - "
\n", - "
str
\n", - "\n", - " completion date of the digitization of a work\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "contributors\n", - "
\n", - "
str
\n", - "\n", - " contributors of a work\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "ed\n", - "
\n", - "
str
\n", - "\n", - " editor of something (lem or pb)\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "edRat\n", - "
\n", - "
str
\n", - "\n", - " editor of lem\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "editor\n", - "
\n", - "
str
\n", - "\n", - " editor of a work\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "id\n", - "
\n", - "
str
\n", - "\n", - " id of an element\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "iscursief\n", - "
\n", - "
str
\n", - "\n", - " whether the word is in italic\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "iskap\n", - "
\n", - "
str
\n", - "\n", - " whether the word is in uppercase\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "issc\n", - "
\n", - "
str
\n", - "\n", - " whether the word is in small caps\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "isspat\n", - "
\n", - "
str
\n", - "\n", - " whether the word is differently spaced\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "isvet\n", - "
\n", - "
str
\n", - "\n", - " whether the word is in bold\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "n\n", - "
\n", - "
int
\n", - "\n", - " number of whatever element\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "next\n", - "
\n", - "
str
\n", - "\n", - " id of next analogous element\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "note\n", - "
\n", - "
str
\n", - "\n", - " text of a note to a word\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "nstr\n", - "
\n", - "
str
\n", - "\n", - " number of whatever element if the value is not numeric\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "orig\n", - "
\n", - "
str
\n", - "\n", - " original title of a book (titleRef elements)\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "otype\n", - "
\n", - "
str
\n", - "\n", - " \n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "prev\n", - "
\n", - "
str
\n", - "\n", - " id of previous analogous element\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "project\n", - "
\n", - "
str
\n", - "\n", - " project in which a work has been digitized\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word outside apparatus and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in lem and not in any rdg; for words in rdgs it is the empty string, and it is undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_151\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness 151 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_249\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness 249 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_bhd1\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness bhd1 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_bhd2\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness bhd2 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_bhd20\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness bhd20 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_bhd25\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness bhd25 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_bhd26\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness bhd26 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_bhd5\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness bhd5 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_bhd7\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness bhd7 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_bt\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness bt and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_d1\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d1 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_d10\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d10 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_d11\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d11 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_d12\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d12 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_d14\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d14 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_d15\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d15 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_d16\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d16 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_d2\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d2 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_d22\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d22 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_d23\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d23 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_d24\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d24 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_d3\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d3 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_d4\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d4 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_d5\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d5 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_d6\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d6 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_d7\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness d7 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_dj\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness dj and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_dj-1\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness dj-1 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_dj-2\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness dj-2 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_dj-3\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness dj-3 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_dj138\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness dj138 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_dj151\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness dj151 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_dj196\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness dj196 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_dj343\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness dj343 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_etc\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness etc and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_js282\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness js282 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punc_lp\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word in an rdg for witness lp and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "punca\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word outside as well as anywhere inside an apparatus\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncb\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word of the base text outside as well as inside an apparatus\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncr\n", - "
\n", - "
str
\n", - "\n", - " nonword chars after a word inside rdg and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_151\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness 151 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_249\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness 249 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_bhd1\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness bhd1 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_bhd2\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness bhd2 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_bhd20\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness bhd20 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_bhd25\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness bhd25 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_bhd26\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness bhd26 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_bhd5\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness bhd5 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_bhd7\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness bhd7 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_bt\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness bt else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_d1\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d1 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_d10\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d10 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_d11\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d11 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_d12\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d12 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_d14\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d14 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_d15\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d15 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_d16\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d16 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_d2\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d2 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_d22\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d22 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_d23\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d23 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_d24\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d24 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_d3\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d3 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_d4\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d4 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_d5\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d5 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_d6\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d6 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_d7\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d7 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_dj\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness dj else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_dj-1\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness dj-1 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_dj-2\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness dj-2 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_dj-3\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness dj-3 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_dj138\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness dj138 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_dj151\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness dj151 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_dj196\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness dj196 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_dj343\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness dj343 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_etc\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness etc else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_js282\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness js282 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "puncx_lp\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness lp else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "ref\n", - "
\n", - "
str
\n", - "\n", - " reference to a witness (in lem and rdg)\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "reg\n", - "
\n", - "
str
\n", - "\n", - " regular form of something, e.g. a name\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "resp\n", - "
\n", - "
str
\n", - "\n", - " person responsible for something (note or rdg)\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "status\n", - "
\n", - "
str
\n", - "\n", - " certainty of something (note, seg)\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "subtype\n", - "
\n", - "
str
\n", - "\n", - " subtype of a seg\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "title\n", - "
\n", - "
str
\n", - "\n", - " title of a work\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word outside apparatus and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in lem and not in any rdg; for words in rdgs it is the empty string, and it is undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_151\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness 151 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_249\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness 249 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_bhd1\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness bhd1 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_bhd2\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness bhd2 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_bhd20\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness bhd20 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_bhd25\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness bhd25 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_bhd26\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness bhd26 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_bhd5\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness bhd5 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_bhd7\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness bhd7 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_bt\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness bt and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_d1\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness d1 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_d10\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness d10 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_d11\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness d11 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_d12\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness d12 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_d14\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness d14 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_d15\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness d15 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_d16\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness d16 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_d2\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness d2 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_d22\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness d22 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_d23\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness d23 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_d24\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness d24 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_d3\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness d3 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_d4\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness d4 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_d5\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness d5 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_d6\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness d6 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_d7\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness d7 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_dj\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness dj and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_dj-1\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness dj-1 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_dj-2\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness dj-2 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_dj-3\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness dj-3 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_dj138\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness dj138 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_dj151\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness dj151 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_dj196\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness dj196 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_dj343\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness dj343 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_etc\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness etc and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_js282\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness js282 and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "trans_lp\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word in an rdg for witness lp and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transa\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word outside as well as anywhere inside an apparatus\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transb\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word of the base text outside as well as inside an apparatus\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transr\n", - "
\n", - "
str
\n", - "\n", - " transcription of a word inside rdg and undefined elsewhere\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_151\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness 151 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_249\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness 249 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_bhd1\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness bhd1 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_bhd2\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness bhd2 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_bhd20\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness bhd20 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_bhd25\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness bhd25 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_bhd26\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness bhd26 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_bhd5\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness bhd5 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_bhd7\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness bhd7 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_bt\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness bt else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_d1\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d1 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_d10\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d10 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_d11\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d11 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_d12\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d12 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_d14\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d14 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_d15\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d15 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_d16\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d16 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_d2\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d2 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_d22\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d22 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_d23\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d23 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_d24\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d24 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_d3\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d3 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_d4\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d4 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_d5\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d5 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_d6\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d6 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_d7\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness d7 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_dj\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness dj else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_dj-1\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness dj-1 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_dj-2\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness dj-2 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_dj-3\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness dj-3 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_dj138\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness dj138 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_dj151\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness dj151 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_dj196\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness dj196 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_dj343\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness dj343 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_etc\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness etc else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_js282\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness js282 else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "transx_lp\n", - "
\n", - "
str
\n", - "\n", - " empty in a lem if there is a related rdg for this witness lp else undefined\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "typ\n", - "
\n", - "
str
\n", - "\n", - " type of a word. If present it is blank line and indicates a blank line at that position\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "type\n", - "
\n", - "
str
\n", - "\n", - " ype of something (head, milestone, name, note, pb, q, rdg, seg)\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "wit\n", - "
\n", - "
str
\n", - "\n", - " list of witnesses of an rdg element\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "oslots\n", - "
\n", - "
none
\n", - "\n", - " \n", - "\n", - "
\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "" ], @@ -6441,7 +1067,7 @@ "text/html": [ "\n", "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Text-Fabric API: names N F E L T S C TF directly usable

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "data: ~/github/CLARIAH/descartes-tf/source/illustrations" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Found 5 symbols
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Found 310 illustrations
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "A = use(\n", + " \"CLARIAH/descartes-tf:clone\",\n", + " checkout=\"clone\",\n", + " hoist=globals(),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "d0e8e576-9e8a-4183-94e0-5553d3008be3", + "metadata": { + "tags": [] + }, + "source": [ + "# What have we got?\n", + "\n", + "Let's inspect the data.\n", + "\n", + "The text is represented as nodes with properties. The first word is node 1, the second word is node 2, and so on.\n", + "After the last word node we get nodes for the elements, such a p, formula. We also have nodes for letters and volumes.\n", + "\n", + "All nodes can be dressed up with *features*.\n", + "A feature is a piece of data that specifies values for nodes.\n", + "\n", + "For example, the feature `trans` gives the text of each word node, and the feature `punc` gives the text after a word but before the next word.\n", + "\n", + "This gives a very crude insight in the data that Text-Fabric works with. 
Text-Fabric is a machine\n", + "that can weave the original text out of the threads given by the features.\n", + "\n", + "Think of the nodes as the warp, through which the features are woven as wefts.\n", + "See also the [fabric metaphor](https://annotation.github.io/text-fabric/tf/about/datamodel.html#fabric-metaphor).\n", + "\n", + "But it can also weave all kinds of other things out of the data.\n", + "\n", + "We can get a stock overview of the warehouse of nodes and features as follows:\n", + "\n", + "* **features** if you click on the little triangle before **Descartes = Descartes, all letters** above,\n", + " you'll see a list of features with their descriptions:\n", + " * you can see which features have been loaded;\n", + " * if you click on a feature name, you find its documentation;\n", + " * if you hover over a name, you see where the feature is located on your system;\n", + " * edge features are marked by **_bold italic_** formatting.\n", + "* **nodes** we show an inventory using\n", + " [`C.levels.data`](https://annotation.github.io/text-fabric/tf/cheatsheet.html#c-computed-data-components)" + ] + }, + { + "cell_type": "markdown", + "id": "63daf300-ebf1-485c-aca1-4eceb7997def", + "metadata": {}, + "source": [ + "# Counting\n", + "We count all nodes, of any type." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ede0a9de-2e8c-47de-993d-924dd6375ec9", + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-18T09:17:43.894153Z", + "start_time": "2018-05-18T09:17:43.597128Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0.00s Counting nodes ...\n", + " 0.06s 722766 nodes\n" + ] + } + ], + "source": [ + "A.indent(reset=True)\n", + "A.info(\"Counting nodes ...\")\n", + "\n", + "i = 0\n", + "for n in N.walk():\n", + " i += 1\n", + "\n", + "A.info(\"{} nodes\".format(i))" + ] + }, + { + "cell_type": "markdown", + "id": "8ce2aaca-184c-4917-91ec-e39b345a0529", + "metadata": {}, + "source": [ + "# Node types\n", + "\n", + "What is the basic textual unit in this corpus?" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "65ffa665-67e3-4fe0-a5e1-ef210fbae2c7", + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-18T09:17:47.820323Z", + "start_time": "2018-05-18T09:17:47.812328Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'word'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "F.otype.slotType" + ] + }, + { + "cell_type": "markdown", + "id": "827680c2-3c27-4ac4-b0c9-0c5e08055df0", + "metadata": {}, + "source": [ + "A quick way to list all node types:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b85d5e3c-140a-4522-85d6-df31ed9d408a", + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-18T09:17:49.922863Z", + "start_time": "2018-05-18T09:17:49.916078Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('volume',\n", + " 'letter',\n", + " 'page',\n", + " 'postscriptum',\n", + " 'opener',\n", + " 'closer',\n", + " 'address',\n", + " 'head',\n", + " 'p',\n", + " 'sentence',\n", + " 'hi',\n", + " 'formula',\n", + " 'figure',\n", + " 'word')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], 
+ "source": [ + "F.otype.all" + ] + }, + { + "cell_type": "markdown", + "id": "2db2a7e2-e011-4fd8-9701-1f2ad0e1aee2", + "metadata": {}, + "source": [ + "# Checks and balances\n", + "\n", + "Let's collect the words outside any page, if any:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ea10deda-665a-4476-b778-e96a8a9765a7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 page outsiders\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "outsiders = []\n", + "\n", + "for w in F.otype.s(\"word\"):\n", + " if not L.u(w, otype=\"page\"):\n", + " outsiders.append((w,))\n", + " if len(outsiders) > 10:\n", + " break\n", + "\n", + "print(f\"{len(outsiders)} page outsiders\")\n", + "A.table(outsiders, withNodes=True)" + ] + }, + { + "cell_type": "markdown", + "id": "c8194a07-913c-4fb3-8b64-4451a2274cf1", + "metadata": {}, + "source": [ + "# Word matters\n", + "\n", + "We can only work with the surface forms of words, there is no concept of lexeme in the corpus (yet).\n", + "\n", + "## Top 30 frequent words\n", + "\n", + "There is a simple function to get a frequency list of feature values.\n", + "Here we call it for the feature `transa`, which contains the text for every word in the base text and in every variant:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ad775575-3b68-4a22-850b-68fe3ceb1def", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 24103 de\n", + " 20661 \n", + " 18240 que\n", + " 14184 et\n", + " 11833 la\n", + " 10493 en\n", + " 10368 à\n", + " 9924 qu\n", + " 9354 il\n", + " 8407 je\n", + " 8225 l\n", + " 8162 est\n", + " 7933 le\n", + " 7629 qui\n", + " 7214 ne\n", + " 7139 vous\n", + " 7048 les\n", + " 5726 d\n", + " 5511 ce\n", + " 4633 n\n", + " 4597 pour\n", + " 4173 a\n", + " 3838 plus\n", + " 3821 si\n", + " 3748 un\n", + " 3659 pas\n", + " 3545 des\n", + " 3438 j\n", + " 3396 par\n", + " 3393 me\n" + ] + } + ], + "source": [ + "for (word, amount) in F.trans.freqList()[0:30]:\n", + " print(f\"{amount:>6} {word}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b29cf9ec-7ea1-486f-9578-597dcabf8e7a", + "metadata": {}, + "source": [ + "# Words that are unique to a letter\n", + "\n", + "Are there words that are unique to a letter?\n", + "And if so, which letter has the most of them?\n", + "That 
letter is the most idiosyncratic letter.\n", + "\n", + "Task: list the letters in a table sorted by degree of idiosyncrasy, and show the\n", + "idiosyncrasy of each letter.\n", + "\n", + "## Method\n", + "\n", + "For each word, the support base is the set of letters in which the word occurs.\n", + "We take only distinct words into account when we count words.\n", + "We make all words lower case.\n", + "\n", + "Let's compute the support base of all words.\n", + "\n", + "We also need to count how many distinct words each letter contains.\n", + "\n", + "And we also want to find out how many hapaxes there are, so we also make an\n", + "index for the occurrences of each word form." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0dbae8c1-1bb1-4a62-b105-14b43be5b022", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 38034 distinct words\n" + ] + } + ], + "source": [ + "wordOccs = collections.defaultdict(list)\n", + "wordsByLetter = collections.defaultdict(set)\n", + "supportBase = collections.defaultdict(set)\n", + "\n", + "for letter in F.otype.s(\"letter\"):\n", + " for w in L.d(letter, otype=\"word\"):\n", + " word = F.trans.v(w)\n", + " if not word:\n", + " continue\n", + " \n", + " wordOccs[word].append(w)\n", + " wordsByLetter[letter].add(word)\n", + " supportBase[word].add(letter)\n", + " \n", + "print(f\"There are {len(wordOccs)} distinct words\")" + ] + }, + { + "cell_type": "markdown", + "id": "40882651-877e-4ffd-80eb-8ec29f5ed9a5", + "metadata": {}, + "source": [ + "We can find the hapaxes as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b95b3358-e03a-4884-87d7-c2515428bbd8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 19326 hapaxes\n" + ] + } + ], + "source": [ + "hapaxes = {word for (word, occs) in wordOccs.items() if len(occs) == 1}\n", + "\n", + "print(f\"There are 
{len(hapaxes)} hapaxes\")" + ] + }, + { + "cell_type": "markdown", + "id": "995227bf-15aa-4678-995f-aa2f7fdf3b5c", + "metadata": {}, + "source": [ + "In the same way we can find the idiosyncratic words:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "f8335a08-36c8-439d-8e27-7c279d3b8d5b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 20798 idiosyncratic words\n" + ] + } + ], + "source": [ + "idiosyncraticWords = {word for (word, letters) in supportBase.items() if len(letters) == 1}\n", + "\n", + "print(f\"There are {len(idiosyncraticWords)} idiosyncratic words\")" + ] + }, + { + "cell_type": "markdown", + "id": "7f249d83-60aa-4d3e-bc39-979db26ec603", + "metadata": {}, + "source": [ + "Now we can make a table of the letters where for each letter we list the total\n", + "number of distinct words, the number of idiosyncratic words,\n", + "and the percentage of idiosyncratic words with respect to the total number of words." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "72efbbda-c8ec-4619-9e67-ac501bd20744", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('1001', 275, 61, 22),\n", + " ('1002', 504, 123, 24),\n", + " ('1003', 77, 8, 10),\n", + " ('1004', 240, 48, 20),\n", + " ('1005', 250, 47, 19),\n", + " ('1006', 363, 94, 26),\n", + " ('1007', 112, 11, 10),\n", + " ('1008', 122, 17, 14),\n", + " ('1009', 128, 10, 8),\n", + " ('1010', 182, 8, 4)]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table = []\n", + "\n", + "for letter in F.otype.s(\"letter\"):\n", + " letterId = F.id.v(letter)\n", + " words = wordsByLetter[letter]\n", + " idio = {word for word in words if word in idiosyncraticWords}\n", + " \n", + " nWords = len(words)\n", + " nIdio = len(idio)\n", + " perc = int(round(100 * nIdio / nWords))\n", + " \n", + " table.append((letterId, nWords, nIdio, perc))\n", + " \n", + "table[0:10]" + ] + }, + { + "cell_type": "markdown", + "id": "cbfe510b-3f37-4ab8-b7a4-6f6cf705290a", + "metadata": {}, + "source": [ + "We can make that prettier by rendering it in Markdown.\n", + "And we have to sort it on the percentage column.\n", + "And we add a grand total.\n", + "\n", + "We do not show the letters that have less than 20% idiosyncratic words." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "58a47257-47b6-40ac-8ea3-74b775b3d05e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "\n", + "letter | #words | #idio | %perc\n", + "--- | --- | --- | ---\n", + "6391 | 412 | 271 | 66\n", + "6431 | 84 | 40 | 48\n", + "8648 | 1249 | 408 | 33\n", + "1012 | 738 | 241 | 33\n", + "6394 | 120 | 40 | 33\n", + "1032 | 1272 | 404 | 32\n", + "6425 | 1203 | 389 | 32\n", + "4280 | 752 | 240 | 32\n", + "4233 | 510 | 157 | 31\n", + "5382 | 288 | 90 | 31\n", + "8661 | 1888 | 568 | 30\n", + "7588 | 1238 | 367 | 30\n", + "6420 | 192 | 58 | 30\n", + "3226 | 178 | 53 | 30\n", + "7543 | 2078 | 574 | 28\n", + "5307 | 1503 | 407 | 27\n", + "1006 | 363 | 94 | 26\n", + "1116 | 251 | 62 | 25\n", + "2139 | 1312 | 312 | 24\n", + "4251 | 979 | 239 | 24\n", + "8681 | 910 | 222 | 24\n", + "4243 | 823 | 200 | 24\n", + "5335 | 768 | 184 | 24\n", + "1002 | 504 | 123 | 24\n", + "2149 | 682 | 160 | 23\n", + "4303 | 374 | 87 | 23\n", + "5341 | 259 | 60 | 23\n", + "5383 | 79 | 18 | 23\n", + "2120 | 1621 | 363 | 22\n", + "1117 | 906 | 198 | 22\n", + "2150 | 838 | 188 | 22\n", + "2132 | 418 | 90 | 22\n", + "4234 | 323 | 72 | 22\n", + "1001 | 275 | 61 | 22\n", + "5327 | 2028 | 422 | 21\n", + "5318 | 311 | 65 | 21\n", + "4260 | 759 | 151 | 20\n", + "1066 | 604 | 122 | 20\n", + "1004 | 240 | 48 | 20\n", + "6453 | 123 | 25 | 20\n", + "**725** letters | **38034** | **20798** | **55**\n", + "**725** letters | **262183** | **20798** | **8**\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "md = \"\"\"\n", + "letter | #words | #idio | %perc\n", + "--- | --- | --- | ---\n", + "\"\"\"\n", + "\n", + "totalNw = 0\n", + "\n", + "for (letter, nw, ni, per) in sorted(table, key=lambda x: (-x[-1], -x[-2], x[1], x[0])):\n", + " if per >= 20:\n", + " md += f\"\"\"{letter} | {nw} | {ni} | {per}\\n\"\"\"\n", + " totalNw += nw\n", + " \n", + " \n", + "overall = 
int(round(100 * len(idiosyncraticWords) / len(wordOccs)))\n", + "overall2 = int(round(100 * len(idiosyncraticWords) / totalNw))\n", + "md += f\"\"\"**{len(table)}** letters | **{len(wordOccs)}** | **{len(idiosyncraticWords)}** | **{overall}**\\n\"\"\"\n", + "md += f\"\"\"**{len(table)}** letters | **{totalNw}** | **{len(idiosyncraticWords)}** | **{overall2}**\\n\"\"\"\n", + "\n", + "A.dm(md)" + ] + }, + { + "cell_type": "markdown", + "id": "77b1e6d6-01c6-4bfa-b938-9d5f96e3e2d8", + "metadata": {}, + "source": [ + "It might seem strange that the overall idiosyncrasy is much bigger than the idiosyncrasy of the individual\n", + "letters.\n", + "\n", + "This follows from the fact that if we take the numbers of distinct words per letter and take the sum of that,\n", + "we end up with a much bigger number than the total number of distinct words in the whole corpus.\n", + "\n", + "That is because words that occur in multiple letters are counted multiple times.\n", + "\n", + "If we use the sum of the per-letter distinct words, the total idiosyncrasy is the weighted average of the letter\n", + "idiosyncrasies." 
+ ] + }, + { + "cell_type": "markdown", + "id": "09577595-dbb0-4322-a61f-1615613325d3", + "metadata": { + "tags": [] + }, + "source": [ + "---\n", + "\n", + "# Contents\n", + "\n", + "* **[start](start.ipynb)** intro and highlights\n", + "* **search** turbo charge your hand-coding with search templates\n", + "* **[compute](compute.ipynb)** sink down a level and compute it yourself\n", + "* **[exportExcel](exportExcel.ipynb)** make tailor-made spreadsheets out of your results\n", + "\n", + "Advanced\n", + "\n", + "* **[similar sentences](similar.ipynb)** find similar sentences\n", + "\n", + "CC-BY Dirk Roorda" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorial/exportExcel.ipynb b/tutorial/exportExcel.ipynb new file mode 100644 index 0000000..52a0549 --- /dev/null +++ b/tutorial/exportExcel.ipynb @@ -0,0 +1,1321 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "\n", + "\n", + "\n", + "\n", + "To get started: consult [start](start.ipynb)\n", + "\n", + "---\n", + "\n", + "# Export to Excel\n", + "\n", + "Sometimes you want to use the convenience of Excel.\n", + "\n", + "Here are ways to put your data into a spreadsheet.\n", + "\n", + "In fact, what we produce are *tab-separated* files that open easily in \n", + "Excel, Numbers, or any spreadsheet app, including ordinary text-editors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from tf.app import use" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "TF-app: ~/github/CLARIAH/descartes-tf/app" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "data: ~/github/CLARIAH/descartes-tf/tf/1.0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "data: ~/github/CLARIAH/descartes-tf/parallels/tf/1.0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is Text-Fabric 11.0.7\n", + "Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html\n", + "\n", + "28 features found and 0 ignored\n", + " 0.09s Dataset without structure sections in otext:no structure functions in the T-API\n", + " 0.34s All features loaded/computed - for details use TF.isLoaded()\n", + " 0.01s All additional features loaded - for details use TF.isLoaded()\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Text-Fabric: Text-Fabric API 11.0.7, CLARIAH/descartes-tf/app v3, Search Reference
\n", + " Data: DESCARTES-TF, Character table, Feature docs
\n", + "
Node types\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
Name# of nodes# slots/node% coverage
volume885241.88100
letter725940.60100
page2884236.45100
postscriptum5646.790
opener5451.970
closer54113.101
address8615.220
head72523.372
p843880.82100
sentence1433245.7496
hi59724.634
formula62001.211
figure3191.000
word6819351.00100
\n", + " Sets: no custom sets
\n", + " Features:
\n", + "
Similar Sentences\n", + "
\n", + "\n", + "
\n", + "
\n", + "sim\n", + "
\n", + "
int
\n", + "\n", + " similarity between sentences based on the Levenshtein ratio\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
Descartes = Descartes, all letters\n", + "
\n", + "\n", + "
\n", + "
\n", + "alt_date\n", + "
\n", + "
str
\n", + "\n", + " alternative date of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "alt_id\n", + "
\n", + "
str
\n", + "\n", + " alternative ids of a letter, comma separated\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "cert\n", + "
\n", + "
str
\n", + "\n", + " certainty of something\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "date\n", + "
\n", + "
str
\n", + "\n", + " date of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "id\n", + "
\n", + "
str
\n", + "\n", + " id of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "intermediary\n", + "
\n", + "
str
\n", + "\n", + " person involved in the transmission of the letter from sender to receiver\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "isitalic\n", + "
\n", + "
str
\n", + "\n", + " whether the word is in italic\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "ismargin\n", + "
\n", + "
str
\n", + "\n", + " whether the word is in the margin\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "issub\n", + "
\n", + "
str
\n", + "\n", + " whether the word is in subscript\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "issup\n", + "
\n", + "
str
\n", + "\n", + " whether the word is in supscript\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "language\n", + "
\n", + "
str
\n", + "\n", + " language of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "level\n", + "
\n", + "
str
\n", + "\n", + " level of a paragraph when it acts like a heading\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "n\n", + "
\n", + "
int
\n", + "\n", + " number of whatever element\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "notation\n", + "
\n", + "
str
\n", + "\n", + " notation method of a formula\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "otype\n", + "
\n", + "
str
\n", + "\n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "punc\n", + "
\n", + "
str
\n", + "\n", + " nonword chars after a word \n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "recipient\n", + "
\n", + "
str
\n", + "\n", + " recipient of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "recipientloc\n", + "
\n", + "
str
\n", + "\n", + " location from where a letter was received\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "resp\n", + "
\n", + "
str
\n", + "\n", + " person responsible for something\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "sender\n", + "
\n", + "
str
\n", + "\n", + " sender of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "senderloc\n", + "
\n", + "
str
\n", + "\n", + " location from where a letter was sent\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "tex\n", + "
\n", + "
str
\n", + "\n", + " unformatted TeX code of a formula, without the `$`\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "trans\n", + "
\n", + "
str
\n", + "\n", + " transcription of a word \n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "typ\n", + "
\n", + "
str
\n", + "\n", + " kind of a node; \"empty\"; \"formula\", \"head\", \"symbol\", \"illustration\"\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "url\n", + "
\n", + "
str
\n", + "\n", + " url of a graphic node\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "oslots\n", + "
\n", + "
none
\n", + "\n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Text-Fabric API: names N F E L T S C TF directly usable

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "data: ~/github/CLARIAH/descartes-tf/source/illustrations" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Found 5 symbols
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Found 310 illustrations
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "A = use(\"CLARIAH/descartes-tf:clone\", checkout=\"clone\", hoist=globals())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are going to export all TeX formula elements to file.\n", + "First we query them together and have a quick preview:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0.00s 219 results\n" + ] + } + ], + "source": [ + "results = A.search(\"\"\"\n", + "formula notation=TeX\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
npformula
11 1046:11 ${1\\over 3} {4\\over 9} {16\\over 27} {64\\over 81}$
21 1060:3 $4.900x^{6} \\ {\\it aequat}\\ - 4.899x^{5} + 2.354x^{4} + 16.858x^{3} + 9.458xx + 429x - 4.900$
31 1060:9 ${\\displaystyle\\strut {3xx - 1x}\\over \\displaystyle\\strut 2}$
42 2131:4 ${\\displaystyle\\strut Bq\\over \\displaystyle\\strut D}$
52 2131:4 ${\\displaystyle\\strut {Bq \\ {\\it in}\\ D + Bq \\ {\\it in}\\ E}\\over \\displaystyle\\strut {\\rm D}}$
62 2131:4 $Bq + {\\displaystyle\\strut {Bq \\ {\\it in}\\ E}\\over \\displaystyle\\strut D} + Aq + A \\ {\\it in}\\ E \\ {\\it bis}\\ + Eq$
72 2131:4 ${\\displaystyle\\strut {Bq \\ {\\it in}\\ E}\\over \\displaystyle\\strut D} + A \\ {\\it in}\\ E \\ {\\it bis}\\ + Eq$
82 2131:4 ${\\displaystyle\\strut Bq\\over \\displaystyle\\strut D} + A \\ {\\it bis}\\ + E$
92 2131:4 ${\\displaystyle\\strut Bq\\over \\displaystyle\\strut D} + A \\ {\\it bis}\\ $
102 2152:4 ${\\rm m} - {{\\displaystyle\\strut {\\rm n}\\over \\displaystyle\\strut {\\rm z}}}{\\rm x} + \\sqrt{{\\rm mm} + {\\rm ox} - {{\\displaystyle\\strut {\\rm p}\\over \\displaystyle\\strut {\\rm m}}}{\\rm xx}}$
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "A.table(results, end=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we export them to file." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "A.export(results)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But where are they? Here:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "��R\u0000\t\u0000S\u00001\u0000\t\u0000S\u00002\u0000\t\u0000S\u00003\u0000\t\u0000N\u0000O\u0000D\u0000E\u00001\u0000\t\u0000T\u0000Y\u0000P\u0000E\u00001\u0000\t\u0000T\u0000E\u0000X\u0000T\u00001\u0000\t\u0000n\u0000o\u0000t\u0000a\u0000t\u0000i\u0000o\u0000n\u00001\u0000\n", + "\u00001\u0000\t\u00001\u0000\t\u00001\u00000\u00004\u00006\u0000\t\u00001\u00001\u0000\t\u00006\u00008\u00003\u00004\u00008\u00000\u0000\t\u0000f\u0000o\u0000r\u0000m\u0000u\u0000l\u0000a\u0000\t\u0000$\u0000{\u00001\u0000\\\u0000o\u0000v\u0000e\u0000r\u0000 \u00003\u0000}\u0000 \u0000{\u00004\u0000\\\u0000o\u0000v\u0000e\u0000r\u0000 \u00009\u0000}\u0000 \u0000{\u00001\u00006\u0000\\\u0000o\u0000v\u0000e\u0000r\u0000 \u00002\u00007\u0000}\u0000 \u0000{\u00006\u00004\u0000\\\u0000o\u0000v\u0000e\u0000r\u0000 \u00008\u00001\u0000}\u0000$\u0000 \u0000\t\u0000T\u0000e\u0000X\u0000\n", + "\u00002\u0000\t\u00001\u0000\t\u00001\u00000\u00006\u00000\u0000\t\u00003\u0000\t\u00006\u00008\u00003\u00005\u00006\u00008\u0000\t\u0000f\u0000o\u0000r\u0000m\u0000u\u0000l\u0000a\u0000\t\u0000$\u00004\u0000.\u00009\u00000\u00000\u0000x\u0000^\u0000{\u00006\u0000}\u0000 \u0000\\\u0000 \u0000{\u0000\\\u0000i\u0000t\u0000 \u0000a\u0000e\u0000q\u0000u\u0000a\u0000t\u0000}\u0000\\\u0000 \u0000 \u0000-\u0000 \u00004\u0000.\u00008\u00009\u00009\u0000x\u0000^\u0000{\u00005\u0000}\u0000 
\u0000+\u0000 \u00002\u0000.\u00003\u00005\u00004\u0000x\u0000^\u0000{\u00004\u0000}\u0000 \u0000+\u0000 \u00001\u00006\u0000.\u00008\u00005\u00008\u0000x\u0000^\u0000{\u00003\u0000}\u0000 \u0000+\u0000 \u00009\u0000.\u00004\u00005\u00008\u0000x\u0000x\u0000 \u0000+\u0000 \u00004\u00002\u00009\u0000x\u0000 \u0000-\u0000 \u00004\u0000.\u00009\u00000\u00000\u0000$\u0000 \u0000\t\u0000T\u0000e\u0000X\u0000\n", + "\u00003\u0000\t\u00001\u0000\t\u00001\u00000\u00006\u00000\u0000\t\u00009\u0000\t\u00006\u00008\u00003\u00005\u00007\u00007\u0000\t\u0000f\u0000o\u0000r\u0000m\u0000u\u0000l\u0000a\u0000\t\u0000$\u0000{\u0000\\\u0000d\u0000i\u0000s\u0000p\u0000l\u0000a\u0000y\u0000s\u0000t\u0000y\u0000l\u0000e\u0000\\\u0000s\u0000t\u0000r\u0000u\u0000t\u0000 \u0000{\u00003\u0000x\u0000x\u0000 \u0000-\u0000 \u00001\u0000x\u0000}\u0000\\\u0000o\u0000v\u0000e\u0000r\u0000 \u0000\\\u0000d\u0000i\u0000s\u0000p\u0000l\u0000a\u0000y\u0000s\u0000t\u0000y\u0000l\u0000e\u0000\\\u0000s\u0000t\u0000r\u0000u\u0000t\u0000 \u00002\u0000}\u0000$\u0000 \u0000\t\u0000T\u0000e\u0000X\u0000\n", + "\u00004\u0000\t\u00002\u0000\t\u00002\u00001\u00003\u00001\u0000\t\u00004\u0000\t\u00006\u00008\u00004\u00001\u00001\u00009\u0000\t\u0000f\u0000o\u0000r\u0000m\u0000u\u0000l\u0000a\u0000\t\u0000$\u0000{\u0000\\\u0000d\u0000i\u0000s\u0000p\u0000l\u0000a\u0000y\u0000s\u0000t\u0000y\u0000l\u0000e\u0000\\\u0000s\u0000t\u0000r\u0000u\u0000t\u0000 \u0000B\u0000q\u0000\\\u0000o\u0000v\u0000e\u0000r\u0000 \u0000\\\u0000d\u0000i\u0000s\u0000p\u0000l\u0000a\u0000y\u0000s\u0000t\u0000y\u0000l\u0000e\u0000\\\u0000s\u0000t\u0000r\u0000u\u0000t\u0000 \u0000D\u0000}\u0000$\u0000 \u0000\t\u0000T\u0000e\u0000X\u0000\n", + 
"\u00005\u0000\t\u00002\u0000\t\u00002\u00001\u00003\u00001\u0000\t\u00004\u0000\t\u00006\u00008\u00004\u00001\u00002\u00008\u0000\t\u0000f\u0000o\u0000r\u0000m\u0000u\u0000l\u0000a\u0000\t\u0000$\u0000{\u0000\\\u0000d\u0000i\u0000s\u0000p\u0000l\u0000a\u0000y\u0000s\u0000t\u0000y\u0000l\u0000e\u0000\\\u0000s\u0000t\u0000r\u0000u\u0000t\u0000 \u0000{\u0000B\u0000q\u0000 \u0000\\\u0000 \u0000{\u0000\\\u0000i\u0000t\u0000 \u0000i\u0000n\u0000}\u0000\\\u0000 \u0000 \u0000D\u0000 \u0000+\u0000 \u0000B\u0000q\u0000 \u0000\\\u0000 \u0000{\u0000\\\u0000i\u0000t\u0000 \u0000i\u0000n\u0000}\u0000\\\u0000 \u0000 \u0000E\u0000}\u0000\\\u0000o\u0000v\u0000e\u0000r\u0000 \u0000\\\u0000d\u0000i\u0000s\u0000p\u0000l\u0000a\u0000y\u0000s\u0000t\u0000y\u0000l\u0000e\u0000\\\u0000s\u0000t\u0000r\u0000u\u0000t\u0000 \u0000{\u0000\\\u0000r\u0000m\u0000 \u0000D\u0000}\u0000}\u0000$\u0000 \u0000\t\u0000T\u0000e\u0000X\u0000\n", + "\u00006\u0000\t\u00002\u0000\t\u00002\u00001\u00003\u00001\u0000\t\u00004\u0000\t\u00006\u00008\u00004\u00001\u00003\u00003\u0000\t\u0000f\u0000o\u0000r\u0000m\u0000u\u0000l\u0000a\u0000\t\u0000$\u0000B\u0000q\u0000 \u0000+\u0000 \u0000{\u0000\\\u0000d\u0000i\u0000s\u0000p\u0000l\u0000a\u0000y\u0000s\u0000t\u0000y\u0000l\u0000e\u0000\\\u0000s\u0000t\u0000r\u0000u\u0000t\u0000 \u0000{\u0000B\u0000q\u0000 \u0000\\\u0000 \u0000{\u0000\\\u0000i\u0000t\u0000 \u0000i\u0000n\u0000}\u0000\\\u0000 \u0000 \u0000E\u0000}\u0000\\\u0000o\u0000v\u0000e\u0000r\u0000 \u0000\\\u0000d\u0000i\u0000s\u0000p\u0000l\u0000a\u0000y\u0000s\u0000t\u0000y\u0000l\u0000e\u0000\\\u0000s\u0000t\u0000r\u0000u\u0000t\u0000 \u0000D\u0000}\u0000 \u0000+\u0000 \u0000A\u0000q\u0000 \u0000+\u0000 \u0000A\u0000 \u0000\\\u0000 \u0000{\u0000\\\u0000i\u0000t\u0000 \u0000i\u0000n\u0000}\u0000\\\u0000 \u0000 \u0000E\u0000 \u0000\\\u0000 \u0000{\u0000\\\u0000i\u0000t\u0000 \u0000b\u0000i\u0000s\u0000}\u0000\\\u0000 \u0000 \u0000+\u0000 \u0000E\u0000q\u0000$\u0000 \u0000\t\u0000T\u0000e\u0000X\u0000\n", + 
"\u00007\u0000\t\u00002\u0000\t\u00002\u00001\u00003\u00001\u0000\t\u00004\u0000\t\u00006\u00008\u00004\u00001\u00003\u00004\u0000\t\u0000f\u0000o\u0000r\u0000m\u0000u\u0000l\u0000a\u0000\t\u0000$\u0000{\u0000\\\u0000d\u0000i\u0000s\u0000p\u0000l\u0000a\u0000y\u0000s\u0000t\u0000y\u0000l\u0000e\u0000\\\u0000s\u0000t\u0000r\u0000u\u0000t\u0000 \u0000{\u0000B\u0000q\u0000 \u0000\\\u0000 \u0000{\u0000\\\u0000i\u0000t\u0000 \u0000i\u0000n\u0000}\u0000\\\u0000 \u0000 \u0000E\u0000}\u0000\\\u0000o\u0000v\u0000e\u0000r\u0000 \u0000\\\u0000d\u0000i\u0000s\u0000p\u0000l\u0000a\u0000y\u0000s\u0000t\u0000y\u0000l\u0000e\u0000\\\u0000s\u0000t\u0000r\u0000u\u0000t\u0000 \u0000D\u0000}\u0000 \u0000+\u0000 \u0000A\u0000 \u0000\\\u0000 \u0000{\u0000\\\u0000i\u0000t\u0000 \u0000i\u0000n\u0000}\u0000\\\u0000 \u0000 \u0000E\u0000 \u0000\\\u0000 \u0000{\u0000\\\u0000i\u0000t\u0000 \u0000b\u0000i\u0000s\u0000}\u0000\\\u0000 \u0000 \u0000+\u0000 \u0000E\u0000q\u0000$\u0000 \u0000\t\u0000T\u0000e\u0000X\u0000\n", + "\u00008\u0000\t\u00002\u0000\t\u00002\u00001\u00003\u00001\u0000\t\u00004\u0000\t\u00006\u00008\u00004\u00001\u00003\u00006\u0000\t\u0000f\u0000o\u0000r\u0000m\u0000u\u0000l\u0000a\u0000\t\u0000$\u0000{\u0000\\\u0000d\u0000i\u0000s\u0000p\u0000l\u0000a\u0000y\u0000s\u0000t\u0000y\u0000l\u0000e\u0000\\\u0000s\u0000t\u0000r\u0000u\u0000t\u0000 \u0000B\u0000q\u0000\\\u0000o\u0000v\u0000e\u0000r\u0000 \u0000\\\u0000d\u0000i\u0000s\u0000p\u0000l\u0000a\u0000y\u0000s\u0000t\u0000y\u0000l\u0000e\u0000\\\u0000s\u0000t\u0000r\u0000u\u0000t\u0000 \u0000D\u0000}\u0000 \u0000+\u0000 \u0000A\u0000 \u0000\\\u0000 \u0000{\u0000\\\u0000i\u0000t\u0000 \u0000b\u0000i\u0000s\u0000}\u0000\\\u0000 \u0000 \u0000+\u0000 \u0000E\u0000$\u0000 \u0000\t\u0000T\u0000e\u0000X\u0000\n", + 
"\u00009\u0000\t\u00002\u0000\t\u00002\u00001\u00003\u00001\u0000\t\u00004\u0000\t\u00006\u00008\u00004\u00001\u00003\u00008\u0000\t\u0000f\u0000o\u0000r\u0000m\u0000u\u0000l\u0000a\u0000\t\u0000$\u0000{\u0000\\\u0000d\u0000i\u0000s\u0000p\u0000l\u0000a\u0000y\u0000s\u0000t\u0000y\u0000l\u0000e\u0000\\\u0000s\u0000t\u0000r\u0000u\u0000t\u0000 \u0000B\u0000q\u0000\\\u0000o\u0000v\u0000e\u0000r\u0000 \u0000\\\u0000d\u0000i\u0000s\u0000p\u0000l\u0000a\u0000y\u0000s\u0000t\u0000y\u0000l\u0000e\u0000\\\u0000s\u0000t\u0000r\u0000u\u0000t\u0000 \u0000D\u0000}\u0000 \u0000+\u0000 \u0000A\u0000 \u0000\\\u0000 \u0000{\u0000\\\u0000i\u0000t\u0000 \u0000b\u0000i\u0000s\u0000}\u0000\\\u0000 \u0000$\u0000 \u0000\t\u0000T\u0000e\u0000X\u0000\n" + ] + } + ], + "source": [ + "!head -n 10 ~/Downloads/results.tsv" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "---\n", + "\n", + "# Next steps\n", + "\n", + "By now you have an impression of how to orient yourself in the Descartes dataset.\n", + "The next steps will show you how to get powerful: searching and computing.\n", + "\n", + "After that it is time for collecting results, using them in new annotations and sharing them.\n", + "\n", + "* **[start](start.ipynb)** start computing with this corpus\n", + "* **[search](search.ipynb)** turbo charge your hand-coding with search templates\n", + "* **[compute](compute.ipynb)** sink down a level and compute it yourself\n", + "* **[exportExcel](exportExcel.ipynb)** make tailor-made spreadsheets out of your results\n", + "\n", + "Advanced\n", + "\n", + "* **[similar sentences](similar.ipynb)** find similar sentences\n", + "\n", + "CC-BY Dirk Roorda" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + 
"name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorial/illustrations/illustration-682563.gif b/tutorial/illustrations/illustration-682563.gif new file mode 100644 index 0000000..61fd710 Binary files /dev/null and b/tutorial/illustrations/illustration-682563.gif differ diff --git a/tutorial/illustrations/illustration-682575.gif b/tutorial/illustrations/illustration-682575.gif new file mode 100644 index 0000000..651b766 Binary files /dev/null and b/tutorial/illustrations/illustration-682575.gif differ diff --git a/tutorial/illustrations/illustration-682632.gif b/tutorial/illustrations/illustration-682632.gif new file mode 100644 index 0000000..e32b4a6 Binary files /dev/null and b/tutorial/illustrations/illustration-682632.gif differ diff --git a/tutorial/illustrations/illustration-682657.gif b/tutorial/illustrations/illustration-682657.gif new file mode 100644 index 0000000..1dcc614 Binary files /dev/null and b/tutorial/illustrations/illustration-682657.gif differ diff --git a/tutorial/illustrations/illustration-682658.gif b/tutorial/illustrations/illustration-682658.gif new file mode 100644 index 0000000..886f0b4 Binary files /dev/null and b/tutorial/illustrations/illustration-682658.gif differ diff --git a/tutorial/illustrations/illustration-682659.gif b/tutorial/illustrations/illustration-682659.gif new file mode 100644 index 0000000..1b7fd6d Binary files /dev/null and b/tutorial/illustrations/illustration-682659.gif differ diff --git a/tutorial/illustrations/illustration-682719.gif b/tutorial/illustrations/illustration-682719.gif new file mode 100644 index 0000000..cc11380 Binary files /dev/null and b/tutorial/illustrations/illustration-682719.gif differ diff --git a/tutorial/illustrations/symbol-682564.png 
b/tutorial/illustrations/symbol-682564.png new file mode 100644 index 0000000..a1db224 Binary files /dev/null and b/tutorial/illustrations/symbol-682564.png differ diff --git a/tutorial/illustrations/symbol-682565.png b/tutorial/illustrations/symbol-682565.png new file mode 100644 index 0000000..13f01a5 Binary files /dev/null and b/tutorial/illustrations/symbol-682565.png differ diff --git a/tutorial/illustrations/symbol-682566.png b/tutorial/illustrations/symbol-682566.png new file mode 100644 index 0000000..13f01a5 Binary files /dev/null and b/tutorial/illustrations/symbol-682566.png differ diff --git a/tutorial/illustrations/symbol-682567.png b/tutorial/illustrations/symbol-682567.png new file mode 100644 index 0000000..13f01a5 Binary files /dev/null and b/tutorial/illustrations/symbol-682567.png differ diff --git a/tutorial/illustrations/symbol-682671.png b/tutorial/illustrations/symbol-682671.png new file mode 100644 index 0000000..db85332 Binary files /dev/null and b/tutorial/illustrations/symbol-682671.png differ diff --git a/tutorial/images/browser.png b/tutorial/images/browser.png new file mode 100644 index 0000000..ad22f95 Binary files /dev/null and b/tutorial/images/browser.png differ diff --git a/tutorial/search.ipynb b/tutorial/search.ipynb new file mode 100644 index 0000000..0403a7e --- /dev/null +++ b/tutorial/search.ipynb @@ -0,0 +1,2005 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4a486875-147d-492a-9003-f05c48d841fc", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "---\n", + "\n", + "To get started: consult [start](start.ipynb)\n", + "\n", + "---\n", + "\n", + "# Search Introduction\n", + "\n", + "*Search* in Text-Fabric is a template based way of looking for structural patterns in your dataset.\n", + "\n", + "Within Text-Fabric we have the unique possibility to combine the ease of formulating search templates for\n", + "complicated syntactical patterns with the power of programmatically processing the results.\n", 
+ "\n", + "This notebook will show you how to get up and running.\n", + "\n", + "## Easy command\n", + "\n", + "Search is as simple as saying (just an example)\n", + "\n", + "```python\n", + "results = A.search(template)\n", + "A.show(results)\n", + "```\n", + "\n", + "See all ins and outs in the\n", + "[search template docs](https://annotation.github.io/text-fabric/tf/about/searchusage.html)." + ] + }, + { + "cell_type": "markdown", + "id": "3f0597b0-6f7d-4610-91bb-6aa93a5c3f7a", + "metadata": {}, + "source": [ + "# Incantation\n", + "\n", + "The ins and outs of installing Text-Fabric, getting the corpus, and initializing a notebook are\n", + "explained in the [start tutorial](start.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b8d43d3f-d00a-4ec3-b690-d0fa6fc9dcbe", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "156b5da3-563a-4081-967b-afd74cc314a3", + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-24T10:06:39.818664Z", + "start_time": "2018-05-24T10:06:39.796588Z" + } + }, + "outputs": [], + "source": [ + "from tf.app import use" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d77aff2b-9f7d-45fb-a1a2-7d31c16c2bca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "TF-app: ~/github/CLARIAH/descartes-tf/app" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "data: ~/github/CLARIAH/descartes-tf/tf/1.0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "data: ~/github/CLARIAH/descartes-tf/parallels/tf/1.0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is Text-Fabric 11.0.7\n", + "Api reference : 
https://annotation.github.io/text-fabric/tf/cheatsheet.html\n", + "\n", + "28 features found and 0 ignored\n", + " 0.09s Dataset without structure sections in otext:no structure functions in the T-API\n", + " 0.35s All features loaded/computed - for details use TF.isLoaded()\n", + " 0.01s All additional features loaded - for details use TF.isLoaded()\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Text-Fabric: Text-Fabric API 11.0.7, CLARIAH/descartes-tf/app v3, Search Reference
\n", + " Data: DESCARTES-TF, Character table, Feature docs
\n", + "
Node types\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
Name# of nodes# slots/node% coverage
volume885241.88100
letter725940.60100
page2884236.45100
postscriptum5646.790
opener5451.970
closer54113.101
address8615.220
head72523.372
p843880.82100
sentence1433245.7496
hi59724.634
formula62001.211
figure3191.000
word6819351.00100
\n", + " Sets: no custom sets
\n", + " Features:
\n", + "
Similar Sentences\n", + "
\n", + "\n", + "
\n", + "
\n", + "sim\n", + "
\n", + "
int
\n", + "\n", + " similarity between sentences based on the Levenshtein ratio\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
Descartes = Descartes, all letters\n", + "
\n", + "\n", + "
\n", + "
\n", + "alt_date\n", + "
\n", + "
str
\n", + "\n", + " alternative date of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "alt_id\n", + "
\n", + "
str
\n", + "\n", + " alternative ids of a letter, comma separated\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "cert\n", + "
\n", + "
str
\n", + "\n", + " certainty of something\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "date\n", + "
\n", + "
str
\n", + "\n", + " date of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "id\n", + "
\n", + "
str
\n", + "\n", + " id of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "intermediary\n", + "
\n", + "
str
\n", + "\n", + " person involved in the transmission of the letter from sender to receiver\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "isitalic\n", + "
\n", + "
str
\n", + "\n", + " whether the word is in italic\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "ismargin\n", + "
\n", + "
str
\n", + "\n", + " whether the word is in the margin\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "issub\n", + "
\n", + "
str
\n", + "\n", + " whether the word is in subscript\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "issup\n", + "
\n", + "
str
\n", + "\n", + " whether the word is in supscript\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "language\n", + "
\n", + "
str
\n", + "\n", + " language of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "level\n", + "
\n", + "
str
\n", + "\n", + " level of a paragraph when it acts like a heading\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "n\n", + "
\n", + "
int
\n", + "\n", + " number of whatever element\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "notation\n", + "
\n", + "
str
\n", + "\n", + " notation method of a formula\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "otype\n", + "
\n", + "
str
\n", + "\n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "punc\n", + "
\n", + "
str
\n", + "\n", + " nonword chars after a word \n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "recipient\n", + "
\n", + "
str
\n", + "\n", + " recipient of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "recipientloc\n", + "
\n", + "
str
\n", + "\n", + " location from where a letter was received\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "resp\n", + "
\n", + "
str
\n", + "\n", + " person responsible for something\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "sender\n", + "
\n", + "
str
\n", + "\n", + " sender of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "senderloc\n", + "
\n", + "
str
\n", + "\n", + " location from where a letter was sent\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "tex\n", + "
\n", + "
str
\n", + "\n", + " unformatted TeX code of a formula, without the `$`\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "trans\n", + "
\n", + "
str
\n", + "\n", + " transcription of a word \n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "typ\n", + "
\n", + "
str
\n", + "\n", + " kind of a node; \"empty\"; \"formula\", \"head\", \"symbol\", \"illustration\"\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "url\n", + "
\n", + "
str
\n", + "\n", + " url of a graphic node\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "oslots\n", + "
\n", + "
none
\n", + "\n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Text-Fabric API: names N F E L T S C TF directly usable

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "data: ~/github/CLARIAH/descartes-tf/source/illustrations" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Found 5 symbols
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Found 310 illustrations
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "A = use(\n", + " \"CLARIAH/descartes-tf:clone\",\n", + " checkout=\"clone\",\n", + " hoist=globals(),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "60350686-190b-4581-bb11-a5ccd2b21c83", + "metadata": {}, + "source": [ + "# Basic search command\n", + "\n", + "We start with the most simple form of issuing a query.\n", + "Let's look for the 16th sentence of the paragraphs that have that many sentences.\n", + "\n", + "Note that sentences are numbered within paragraphs and that the sentence number is in feature `n`." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b1586fae-955e-4043-84fa-f75320be7fc8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0.01s 13 results\n" + ] + } + ], + "source": [ + "template = \"\"\"\n", + "sentence n=16\n", + "\"\"\"\n", + "\n", + "results = A.search(template)" + ] + }, + { + "cell_type": "markdown", + "id": "eff1eed5-bd15-4264-8e32-8d1feacb440c", + "metadata": {}, + "source": [ + "We see the amount of results, but how do we get the results?" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0a163b37-4db0-440d-a838-5980932cfceb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(709073,),\n", + " (709215,),\n", + " (710310,),\n", + " (712826,),\n", + " (714388,),\n", + " (714650,),\n", + " (717861,),\n", + " (717913,),\n", + " (718273,),\n", + " (718766,),\n", + " (720933,),\n", + " (721641,),\n", + " (722538,)]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results" + ] + }, + { + "cell_type": "markdown", + "id": "f75dc1a4-c4e4-4052-b04e-cbc4bf918b5a", + "metadata": {}, + "source": [ + "Nice try. These are indeed the results, but they are just the nodes, i.e. 
meaningless numbers (to us).\n", + "\n", + "We get more flesh and blood by displaying the results." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3fe1432d-2876-4245-a72a-53778c93dfe9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
npsentence
11 1027:3 Ce n'est pas que je ne l'aime et que je ne le tienne pour un homme tout plein d'honneur et de bonté; mais parce que je ne connais que deux personnes, avec qui il ait jamais eu quelque chose à démêler, qui sont M. Mydorge et M. Morin, et qu'il se plaint de tous les deux, je ne saurais que je ne juge qu'il tient quelque chose de cette humeur, il faut dire qu'il est bien malheureux.
21 1032:4 Quâ tamen in re non judico te satis prudenter cavere tuis rebus: quid enim si de istius manuscripti fide dubitatur? nunquid tutius esset testes adhibere vel tabulis publicis confirmare? Sed profecto, ut verum loquar, istae divitiae, quae fures timent et tantâ cum sollicitudine debent asservari, miserum te reddunt potius quàm beatum; nec, si mihi credis, te pigebit illa amittere simul cum morbo.
31 1116:3 Opinor autem quod, sicuti apud Poetam consessus Didonianus, conticebunt omnes intentique ora tenebunt. Precor autem te et obtestor ut eodem tenore caetera quae in manibus habes prosequaris et aliquando proferas, meque subinde epistolio tuo bees.
43 3174:23 Je m'étonne aussi de ce que, nonobstant que j'aie clairement démontré tout ce que j'ai dit devoir être corrigé en sa règle, et qu'il n'aît donné aucune raison à l'encontre, il ne laisse pas de dire que j'y ai mal réussi, au lieu de quoi je me persuade qu'il m'en devrait remercier; et même il ajoute que j'ai failli pour avoir dit qu'il fallait donner deux noms à la ligne qu'il nomme B etc., ce qui ne réussit, dit-il, qu'aux questions qui sont aisées, au lieu qu'il devrait dire que c'est donc lui-même qui avait failli, à cause que j'ai suivi en cela son texte de mot à mot, ainsi que j'ai faire pour le corriger.
53 3220:3 Je suis, Monsieur, Votre très obéissant et très obligé serviteur, DESCARTES. De Leyde le lundi au soir [12 décembre 1639]. Monsieur,
64 4230:3 (baillet, II, 21-22.)
76 6391:3 Maer niet-te-min dewijl al de Werelt oordeelt, dat hy de voornaemste autheur is vande lasteringhen die in het ghemelt fameux boeck teghens my worden ghevonden, versoeck ick U. Ed.
86 6396:3 Je vous assure qu'elles ne me touchent guère, et ne m'ont point emmaigri, comme Voetius, à qui on dit qu'elles ont ôté treize livres de chair, mais non pas de graisse, à cause qu'il n'en eut jamais tant.
96 6425:9 Quin etiam nullum ea de re scriptum peculiare composui, sed obiter tantum in epistola in qua de Patre quodam Societatis conquerebar, et quam tunc commodam sub praelo habebam, paucas de illo paginas inserui.
106 6470:4 C'est un avantage qu'a eu aussi dans la suite la version française des Principes de M. Descartes, faite par l'Abbé Picot.
117 7601:6 Lorsque l'écriture sainte parle en divers endroits de la multitude innombrable des Anges, elle confirme entièrement cette opinion: car nous jugeons que les moindres Anges sont incomparablement plus parfaits que les hommes.
128 8650:3 Cependant je puis vous assurer que cette Princesse, qui n'estime rien au monde que la vérité et la vertu, fait un grand jugement de vous pour l'amour de l'une et de l'autre.
138 8689:4 préfac.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "A.table(results)" + ] + }, + { + "cell_type": "markdown", + "id": "9976d9ba-be60-42ab-90f4-0d0122352007", + "metadata": {}, + "source": [ + "## Figures\n", + "\n", + "Let's look for all paragraphs with an illustration in it." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4269b0ee-87fe-46e1-bd99-3e62dd64d692", + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"\n", + "p\n", + " figure\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d791e8e9-9dd6-4443-bd6f-75a225f74cdb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0.00s 319 results\n" + ] + } + ], + "source": [ + "results = A.search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "99de2119-7e2b-4f5b-b18a-eb04b9aff5bf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
nppfigure
11 1001:4
21 1002:3
31 1002:3
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "A.table(results, end=3)" + ] + }, + { + "cell_type": "markdown", + "id": "2d6d6313-88a1-452f-9323-e977013d0d5a", + "metadata": {}, + "source": [ + "The results are shown inside the sentences that they occur in.\n", + "`p`s are too big to fit into sentences, so the `p`s are left out and only the images show up.\n", + "\n", + "We can make the display richer: instead of a plain table, we can unfold the sentences in the results:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4d5f1536-4194-4c9d-86b1-a3dbd44580db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

result 1" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

1 1001:4
p 4
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
1 1001:4
sentence 2
Unum
autem
est,
quod,
opinor,
non
satis
meditate
scripsisti:
nempe
omnes
saltus
in
unicâ
voce
fieri
figure AT10-152a.gif
per
consonantias
exactas.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

result 2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

1 1002:3
p 3
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
1 1002:3
sentence 4
Atque
hac
arte
quadruplo
plures
quaestiones
et
longe
difficiliores
solvi
poterunt,
quàm
communi
Algebrâ;
13
enim
diversa
genera
aequationum
cubicarum
numero,
qualia
tantùm
sunt
tria
aequationum
communium:
nempe
inter
formula
1
figure cossic2.png
et
formula
O
figure cossic1.png
+ ON
,
vel
formula
O
figure cossic1.png
− ON
,\n", + "
vel
denique
formula
ON − O
figure cossic1.png
.
Aliud
est
quod
jam
quaero
de
radi
cibus
simul
ex
pluribus
variis
nominibus
compositis
extrahendis;
quod
si
reperero,
ut
spero,
scientiam
illam
plane
digeram
in
ordinem,
si
desidiam
innatam
possim
vincere,
et
fata
liberam
vitam
indulgeant.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

result 3" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

1 1002:3
p 3
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
1 1002:3
sentence 4
Atque
hac
arte
quadruplo
plures
quaestiones
et
longe
difficiliores
solvi
poterunt,
quàm
communi
Algebrâ;
13
enim
diversa
genera
aequationum
cubicarum
numero,
qualia
tantùm
sunt
tria
aequationum
communium:
nempe
inter
formula
1
figure cossic2.png
et
formula
O
figure cossic1.png
+ ON
,
vel
formula
O
figure cossic1.png
− ON
,\n", + "
vel
denique
formula
ON − O
figure cossic1.png
.
Aliud
est
quod
jam
quaero
de
radi
cibus
simul
ex
pluribus
variis
nominibus
compositis
extrahendis;
quod
si
reperero,
ut
spero,
scientiam
illam
plane
digeram
in
ordinem,
si
desidiam
innatam
possim
vincere,
et
fata
liberam
vitam
indulgeant.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "A.show(results, end=3)" + ] + }, + { + "cell_type": "markdown", + "id": "194d0133-3fba-4926-b56b-a033295941b8", + "metadata": {}, + "source": [ + "The results are collected and shown in their surrounding sentence. \n", + "\n", + "Note that we see only the sentences that contain an image.\n", + "\n", + "But we can see more if we tell text-fabric to condense the result not in sentences, but in `p`s:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "9f95f839-91c4-41a3-9fc8-f8d62854f1dc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

result 1" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

1 1001:4
p 4
sentence 1
Quod
ad
tuam
quaestionem
spectat,
ipse
solvis,
nec
melius
potest.
sentence 2
Unum
autem
est,
quod,
opinor,
non
satis
meditate
scripsisti:
nempe
omnes
saltus
in
unicâ
voce
fieri
figure AT10-152a.gif
per
consonantias
exactas.
sentence 3
Distet
enim
nota
formula
A
à
notâ
formula
D
intervallo
unius
quintae:
necessariò
distabit
à
formula
C
spatio
unius
quartae,
non
perfectae,
sed
quae
deficiat
uno
schismate,
ut
demonstratur
ex
numeris
appositis;
quibus
si
utaris,
facillime
cujuslibet
toni
exactam
quantitatem
invenies.
sentence 4
Neque
dixeris
debere
potius
inter
formula
A
et
formula
D
esse
quintam
imperfectam,
ut
formula
AC
sit
vera
quarta
et
exacta;
melius
enim
dissonantia
adverteretur
in
tonis
qui
simul
emitti
debent,
quàm
in
iis
qui
successive.
sentence 5
Quos
existimo,
saltem
in
vocali
musicâ
et
mathematice
eleganti,
nunquam
ab
uno
consonantiae
termino
ad
alium
immediate
pervenire,
sed
vehi
suaviter
per
omne
medium
intervallum;
quod
impedit
ne
unius
schismatis
exiguus
error
distinguatur.
sentence 6
Idque
me
notasse
memini
in
iis,
quae
de
dissonantiis
ante
scripsi;
ad
quae
si
diligenter
advertas
et
ad
reliquam
meam
Musicam,
invenies
omnia
quae
de
consonantiarum,
graduum,
et
dissonantiarum
intervallis
annotavi,
mathematice
demonstrari,
sed
indigeste
et
confuse
nimiumque
breviter
explicata.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

result 2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

1 1002:3
p 3
sentence 1
Prima
est
celeberrima
de
dividendo
angulo
in
aequales
partes
quotlibet.
sentence 2
Tres
aliae
pertinent
ad
aequationes
cub(
ic)
as:
quarum
primum
genus
est
inter
numerum
absolutum,
radices,
et
cubos;
alterum,
inter
numerum
absolutum,
quadrata,
et
cubos;
tertium
denique,
inter
numerum
absolutum,
radices,
quadrata,
et
cubos.
sentence 3
Pro
quibus
3
demonstrationes
repperi,
quarum
unaquaeque
ad
varia
membra
est
extendenda
propter
varietatem
signorum +
et -.
Quae
omnia
nondum
discussi;
sed
facile,
meo
judicio,
quod
in
unis
repperi
ad
alia
applicabo.
sentence 4
Atque
hac
arte
quadruplo
plures
quaestiones
et
longe
difficiliores
solvi
poterunt,
quàm
communi
Algebrâ;
13
enim
diversa
genera
aequationum
cubicarum
numero,
qualia
tantùm
sunt
tria
aequationum
communium:
nempe
inter
formula
1
figure cossic2.png
et
formula
O
figure cossic1.png
+ ON
,
vel
formula
O
figure cossic1.png
− ON
,\n", + "
vel
denique
formula
ON − O
figure cossic1.png
.
Aliud
est
quod
jam
quaero
de
radi
cibus
simul
ex
pluribus
variis
nominibus
compositis
extrahendis;
quod
si
reperero,
ut
spero,
scientiam
illam
plane
digeram
in
ordinem,
si
desidiam
innatam
possim
vincere,
et
fata
liberam
vitam
indulgeant.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

result 3" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

1 1002:3
p 3
sentence 1
Prima
est
celeberrima
de
dividendo
angulo
in
aequales
partes
quotlibet.
sentence 2
Tres
aliae
pertinent
ad
aequationes
cub(
ic)
as:
quarum
primum
genus
est
inter
numerum
absolutum,
radices,
et
cubos;
alterum,
inter
numerum
absolutum,
quadrata,
et
cubos;
tertium
denique,
inter
numerum
absolutum,
radices,
quadrata,
et
cubos.
sentence 3
Pro
quibus
3
demonstrationes
repperi,
quarum
unaquaeque
ad
varia
membra
est
extendenda
propter
varietatem
signorum +
et -.
Quae
omnia
nondum
discussi;
sed
facile,
meo
judicio,
quod
in
unis
repperi
ad
alia
applicabo.
sentence 4
Atque
hac
arte
quadruplo
plures
quaestiones
et
longe
difficiliores
solvi
poterunt,
quàm
communi
Algebrâ;
13
enim
diversa
genera
aequationum
cubicarum
numero,
qualia
tantùm
sunt
tria
aequationum
communium:
nempe
inter
formula
1
figure cossic2.png
et
formula
O
figure cossic1.png
+ ON
,
vel
formula
O
figure cossic1.png
− ON
,\n", + "
vel
denique
formula
ON − O
figure cossic1.png
.
Aliud
est
quod
jam
quaero
de
radi
cibus
simul
ex
pluribus
variis
nominibus
compositis
extrahendis;
quod
si
reperero,
ut
spero,
scientiam
illam
plane
digeram
in
ordinem,
si
desidiam
innatam
possim
vincere,
et
fata
liberam
vitam
indulgeant.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "A.show(results, end=3, condenseType=\"p\")" + ] + }, + { + "cell_type": "markdown", + "id": "6f4def1a-f87a-4f82-a537-0e3fadbcd6a1", + "metadata": {}, + "source": [ + "# Formulas\n", + "\n", + "Now let's look for formulas that have a square root in them.\n", + "\n", + "Note that in TeX a square root is written as `\\sqrt`.\n", + "\n", + "The TeX source of a formula is contained in the `tex` feature of a formula node, provided\n", + "the formula is written in TeX. Not all formulas are written in TeX." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b9a91170-0cd7-4ecc-b6cb-5b4fb08f7f05", + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"\n", + "formula tex~sqrt\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "0c638c11-2064-4c91-b359-81ac6562b251", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0.00s 54 results\n" + ] + } + ], + "source": [ + "results = A.search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "331153b5-2783-4b65-a161-28f157a0382b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

sentence 1" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

2 2152:4
sentence 10
Or
par
cette
seule
équation
de
la
page
326,
à
savoir:
formula
y
formula
figure propto.png
formula TeX
tex={\\rm m} - {{\\displaystyle\\strut {\\rm n}\\over \\displaystyle\\strut {\\rm z}}}{\\rm x} + \\sqrt{{\\rm mm} + {\\rm ox} - {{\\displaystyle\\strut {\\rm p}\\over \\displaystyle\\strut {\\rm m}}}{\\rm xx}}
${\\rm m} - {{\\displaystyle\\strut {\\rm n}\\over \\displaystyle\\strut {\\rm z}}}{\\rm x} + \\sqrt{{\\rm mm} + {\\rm ox} - {{\\displaystyle\\strut {\\rm p}\\over \\displaystyle\\strut {\\rm m}}}{\\rm xx}}$
,$\n", + "
en
changeant
seulement
les
marques +
et -,
ou
supposant
quelques
termes
pour
nuls,
je
comprends
toutes
celles
qui
peuvent
se
rapporter
à
quelque
lieu
plan
ou
solide.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

sentence 2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

3 3174:14
sentence 2
Je
cherche
la
tangente
formula
FE
ou
formula
CB
parallèle
au
diamètre
formula
AK
,
figure AT2-313a.gif
posant
que
la
propriété
de
cette
courbe
est
telle
que,
menant
formula
FG
à
angles
droits
sur
formula
AH
,
l'
agrégat
des
cubes
de
formula
FG
et
formula
AG
est
égal
au
parallélipipède
des
mêmes
formula
FG
et
formula
AG
,
et
d'
une
autre
ligne
donnée
qui
est
double
d'
formula
AH
.\n", + "
Et
je
fais
formula
AG = x
,
formula
GF = y
et
le
double
d'
formula
AH =
hi
formula
n
d'
j'
ai
hi
formula
x
hi
formula
3
formula
+
hi
formula
y
hi
formula
3
formula
=
hi
formula
xyn
Puis
je
fais
formula
AE =
hi
v
de
façon
que
formula
EG
est
hi
formula
x
formula
hi
formula
v
et
parce
que
l'
angle
formula
EFG
est
de
45
degrés,
formula
GF
est
aussi
hi
formula
x
formula
hi
formula
v
ce
que
je
substitue,
au
lieu
d'
hi
formula
y
,
en
l'
équation
précédente,
et
au
lieu
de
hi
formula
y
hi
formula
3
je
substitue
son
cube,
qui
est
hi
formula
x
hi
formula
3
formula
− 3
hi
formula
vxx
formula
+ 3
hi
formula
vvx
formula
hi
formula
v
hi
formula
3
si
bien
que
j'
ai
pour
mon
équation
formula
2
hi
formula
x
hi
formula
3
formula
− 3
hi
formula
vxx
formula
+ 3
hi
formula
vvx
formula
hi
formula
v
hi
formula
3
formula
=
hi
formula
nxx
formula
hi
formula
nvx
ce
que
je
compare
avec
hi
formula
xx
formula
− 2
hi
formula
ex
formula
+
hi
formula
ee
formula
= 0
multiplié
par
formula
2
hi
formula
x
formula
− 2
hi
formula
f
formula
= 0
et
j'
ai
formula
2
hi
formula
x
hi
formula
3
formula
− 4
hi
formula
exx
formula
+ 2
hi
formula
eex
formula
\n", + "− 2
hi
formula
fxx
formula
+ 4
hi
formula
efx
formula
− 2
hi
formula
eef
formula
= 0
de
même
forme
que
formula
2
hi
formula
x
hi
formula
3
formula
− 3
hi
formula
vxx
formula
+ 3
hi
formula
vvx
formula
hi
formula
v
hi
formula
3
formula
= 0 \n", + "−
hi
formula
nxx
formula
+
hi
formula
nvx
. \n", + "
Et
les
termes
multipliés
par
hi
formula
xx
me
donnent
formula
2
hi
formula
f
formula
= 3
hi
formula
v
formula
+
hi
formula
n
formula
− 4
hi
formula
e
Puis
les
termes
multipliés
par
hi
formula
x
me
donnent
formula
6
hi
formula
ev
formula
+ 2
hi
formula
en
formula
− 6
hi
formula
ee
formula
= 3
hi
formula
vv
formula
+
hi
formula
nv
ou
bien
formula TeX
tex=vv = - {{1\\over 3}}nv + 2ev + {{2\\over 3}}ne - 2ee
$vv = - {{1\\over 3}}nv + 2ev + {{2\\over 3}}ne - 2ee$
c'
est-
à-
dire
à
cause
que
hi
formula
e
est
égal
à
hi
formula
x
,
que
hi
formula
v
est
formula TeX
tex=x - {{1\\over 6}}n \\pm \\sqrt{{{1\\over 36}}nn + {{1\\over 3}}nx - xx}
$x - {{1\\over 6}}n \\pm \\sqrt{{{1\\over 36}}nn + {{1\\over 3}}nx - xx}$
. \n", + "
Ce
qui
déterminerait
entièrement
la
tangente
cherchée,
si
la
quantité
hi
formula
x
était
connue;
mais
parce
qu'
elle
ne
l'
est
pas,
il
faut
poursuivre
en
cette
sorte.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

sentence 3" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

3 3174:15
sentence 1
Puisque
hi
formula
y
est
égal
à
hi
formula
x
formula
hi
formula
v
et
que
hi
formula
v
vient
d'
être
trouvé,
nous
avons
aussi
formula TeX
tex=y = {{1\\over 6}}n \\pm \\sqrt{{{1\\over 36}}nn + {{1\\over 3}}nx - xx}
$y = {{1\\over 6}}n \\pm \\sqrt{{{1\\over 36}}nn + {{1\\over 3}}nx - xx}$
ce
qui
étant
substitué
au
lieu
d'
hi
formula
y
,
et
son
cube
au
lieu
d'
hi
formula
y
hi
formula
3
en
la
première
équation,
on
trouve
en
la
démêlant
qu'
elle
se
réduit
à
ces
termes:
formula TeX
tex=x^{4}.. - {{1\\over 9}}n^{3}x + {{1\\over 54}}n^{4} = 0
$x^{4}.. - {{1\\over 9}}n^{3}x + {{1\\over 54}}n^{4} = 0$
. \n", + "
Et
par
la
règle
qui
est
en
ma
hi
Géométrie,
page
383,
j'
écris
en
leur
place:
formula TeX
tex=z^{6}. - {{2\\over 27}}n^{4}zz - {{1\\over 81}}n^{6} = 0
$z^{6}. - {{2\\over 27}}n^{4}zz - {{1\\over 81}}n^{6} = 0$
. \n", + "
Puis (
par
la
page
381)
je
trouve
la
valeur
de
hi
formula
zz
,
qui
est
formula TeX
tex={{1\\over 3}}nn
${{1\\over 3}}nn$
et
formula TeX
tex=z = n\\sqrt{{{1\\over 3}}}
$z = n\\sqrt{{{1\\over 3}}}$
Au
moyen
de
quoi (
par
la
page
385),
je
divise
l'
équation
formula TeX
tex=x^{4} - {{1\\over 9}}n^{3}x + {{1\\over 54}}n^{4} = 0
$x^{4} - {{1\\over 9}}n^{3}x + {{1\\over 54}}n^{4} = 0$
en
deux
autres
qui
sont
formula TeX
tex=xx - nx\\sqrt{{{1\\over 3}} + {{1\\over 6}}nn} - {\\displaystyle\\strut {nn}\\over \\displaystyle\\strut {6\\sqrt{ 3}}} = 0
$xx - nx\\sqrt{{{1\\over 3}} + {{1\\over 6}}nn} - {\\displaystyle\\strut {nn}\\over \\displaystyle\\strut {6\\sqrt{ 3}}} = 0$
et
formula TeX
tex=xx + nx\\sqrt{{{1\\over 3}} + {{1\\over 6}}nn} + {\\displaystyle\\strut {nn}\\over \\displaystyle\\strut {6\\sqrt{ 3}}} = 0
$xx + nx\\sqrt{{{1\\over 3}} + {{1\\over 6}}nn} + {\\displaystyle\\strut {nn}\\over \\displaystyle\\strut {6\\sqrt{ 3}}} = 0$
. \n", + "
Et
par
la
première
de
ces
deux
équations,
je
connais
la
valeur
d'
hi
formula
x
,
qui
est
formula TeX
tex=x = n\\sqrt{{{1\\over 12}}} \\pm \\sqrt{{\\displaystyle\\strut {nn}\\over \\displaystyle\\strut {6\\sqrt{3}}} - {{1\\over 12}}nn}
$x = n\\sqrt{{{1\\over 12}}} \\pm \\sqrt{{\\displaystyle\\strut {nn}\\over \\displaystyle\\strut {6\\sqrt{3}}} - {{1\\over 12}}nn}$
. \n", + "
Enfin,
à
cause
que,
cherchant
en
même
façon
la
ligne
formula
AB
par
la
tangente
formula
CB
,
il
vient
une
équation
toute
semblable,
on
apprend
de
que
la
ligne
formula
AG
est
formula TeX
tex=n\\sqrt{{{1\\over 12}}} + \\sqrt{{\\displaystyle\\strut {nn}\\over \\displaystyle\\strut {6\\sqrt{3}}} - {{1\\over 12}}nn}
$n\\sqrt{{{1\\over 12}}} + \\sqrt{{\\displaystyle\\strut {nn}\\over \\displaystyle\\strut {6\\sqrt{3}}} - {{1\\over 12}}nn}$
et
que
formula
AD
est
formula TeX
tex=n\\sqrt{{{1\\over 12}}} - \\sqrt{{\\displaystyle\\strut {nn}\\over \\displaystyle\\strut {6\\sqrt{3}}} - {{1\\over 12}}nn}
$n\\sqrt{{{1\\over 12}}} - \\sqrt{{\\displaystyle\\strut {nn}\\over \\displaystyle\\strut {6\\sqrt{3}}} - {{1\\over 12}}nn}$
et
par
conséquent
que
formula
DG
est
formula TeX
tex=\\sqrt{{\\displaystyle\\strut {2nn}\\over \\displaystyle\\strut {3\\sqrt{3}}} - {{1\\over 3}}nn}
$\\sqrt{{\\displaystyle\\strut {2nn}\\over \\displaystyle\\strut {3\\sqrt{3}}} - {{1\\over 3}}nn}$
et
que
formula
CF
est
formula TeX
tex=\\sqrt{{\\displaystyle\\strut {4nn}\\over \\displaystyle\\strut {3\\sqrt{3}}} - {{2\\over 3}}nn}
$\\sqrt{{\\displaystyle\\strut {4nn}\\over \\displaystyle\\strut {3\\sqrt{3}}} - {{2\\over 3}}nn}$
. \n", + "
Ce
qui
est
la
plus
grande
largeur
de
la
feuille
qu'
on
demandait,
en
sorte
que,
si
la
ligne
hi
formula
n
est
9,
formula
CF
sera
formula TeX
tex=\\sqrt{36\\sqrt{3} - 54}
$\\sqrt{36\\sqrt{3} - 54}$
et
si
hi
formula
n
est
3,
formula
CF
sera
formula TeX
tex=\\sqrt{4\\sqrt{3} - 6}
$\\sqrt{4\\sqrt{3} - 6}$
et
ainsi
des
autres.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "A.show(results, end=3, condensed=True)" + ] + }, + { + "cell_type": "markdown", + "id": "9a70cab4-9316-46cd-beb5-66edd63f4a58", + "metadata": {}, + "source": [ + "We can get rid of the TeX codes.\n", + "\n", + "We see them because our query mentioned the feature `tex`, but we can turn that off (showing the 3rd result only)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "028d28df-ff81-4e47-b904-974749c69ee2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

sentence 3" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

3 3174:15
sentence 1
Puisque
hi
formula
y
est
égal
à
hi
formula
x
formula
hi
formula
v
et
que
hi
formula
v
vient
d'
être
trouvé,
nous
avons
aussi
formula TeX
$y = {{1\\over 6}}n \\pm \\sqrt{{{1\\over 36}}nn + {{1\\over 3}}nx - xx}$
ce
qui
étant
substitué
au
lieu
d'
hi
formula
y
,
et
son
cube
au
lieu
d'
hi
formula
y
hi
formula
3
en
la
première
équation,
on
trouve
en
la
démêlant
qu'
elle
se
réduit
à
ces
termes:
formula TeX
$x^{4}.. - {{1\\over 9}}n^{3}x + {{1\\over 54}}n^{4} = 0$
. \n", + "
Et
par
la
règle
qui
est
en
ma
hi
Géométrie,
page
383,
j'
écris
en
leur
place:
formula TeX
$z^{6}. - {{2\\over 27}}n^{4}zz - {{1\\over 81}}n^{6} = 0$
. \n", + "
Puis (
par
la
page
381)
je
trouve
la
valeur
de
hi
formula
zz
,
qui
est
formula TeX
${{1\\over 3}}nn$
et
formula TeX
$z = n\\sqrt{{{1\\over 3}}}$
Au
moyen
de
quoi (
par
la
page
385),
je
divise
l'
équation
formula TeX
$x^{4} - {{1\\over 9}}n^{3}x + {{1\\over 54}}n^{4} = 0$
en
deux
autres
qui
sont
formula TeX
$xx - nx\\sqrt{{{1\\over 3}} + {{1\\over 6}}nn} - {\\displaystyle\\strut {nn}\\over \\displaystyle\\strut {6\\sqrt{ 3}}} = 0$
et
formula TeX
$xx + nx\\sqrt{{{1\\over 3}} + {{1\\over 6}}nn} + {\\displaystyle\\strut {nn}\\over \\displaystyle\\strut {6\\sqrt{ 3}}} = 0$
. \n", + "
Et
par
la
première
de
ces
deux
équations,
je
connais
la
valeur
d'
hi
formula
x
,
qui
est
formula TeX
$x = n\\sqrt{{{1\\over 12}}} \\pm \\sqrt{{\\displaystyle\\strut {nn}\\over \\displaystyle\\strut {6\\sqrt{3}}} - {{1\\over 12}}nn}$
. \n", + "
Enfin,
à
cause
que,
cherchant
en
même
façon
la
ligne
formula
AB
par
la
tangente
formula
CB
,
il
vient
une
équation
toute
semblable,
on
apprend
de
que
la
ligne
formula
AG
est
formula TeX
$n\\sqrt{{{1\\over 12}}} + \\sqrt{{\\displaystyle\\strut {nn}\\over \\displaystyle\\strut {6\\sqrt{3}}} - {{1\\over 12}}nn}$
et
que
formula
AD
est
formula TeX
$n\\sqrt{{{1\\over 12}}} - \\sqrt{{\\displaystyle\\strut {nn}\\over \\displaystyle\\strut {6\\sqrt{3}}} - {{1\\over 12}}nn}$
et
par
conséquent
que
formula
DG
est
formula TeX
$\\sqrt{{\\displaystyle\\strut {2nn}\\over \\displaystyle\\strut {3\\sqrt{3}}} - {{1\\over 3}}nn}$
et
que
formula
CF
est
formula TeX
$\\sqrt{{\\displaystyle\\strut {4nn}\\over \\displaystyle\\strut {3\\sqrt{3}}} - {{2\\over 3}}nn}$
. \n", + "
Ce
qui
est
la
plus
grande
largeur
de
la
feuille
qu'
on
demandait,
en
sorte
que,
si
la
ligne
hi
formula
n
est
9,
formula
CF
sera
formula TeX
$\\sqrt{36\\sqrt{3} - 54}$
et
si
hi
formula
n
est
3,
formula
CF
sera
formula TeX
$\\sqrt{4\\sqrt{3} - 6}$
et
ainsi
des
autres.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "A.show(results, start=3, end=3, condensed=True, queryFeatures=False)" + ] + }, + { + "cell_type": "markdown", + "id": "5d89b91b-dc66-4cc8-aec7-43e4a2765164", + "metadata": {}, + "source": [ + "## Formulas without TeX\n", + "\n", + "We gather the formulas not written in TeX:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "398e1c5f-01f2-41d8-9317-ab73dd1f4ce1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0.01s 5981 results\n" + ] + } + ], + "source": [ + "query = \"\"\"\n", + "formula notation#TeX\n", + "\"\"\"\n", + "\n", + "results = A.search(query)" + ] + }, + { + "cell_type": "markdown", + "id": "6df9780c-8aef-40ff-b474-7939e7bc6425", + "metadata": {}, + "source": [ + "The majority is not written in TeX, let's sample a few:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "90250c53-9231-4a76-87ab-94c2df23284a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
npformula
17 7538:6 IN
22 2122:12 G
31 1020:7 ZZ
42 2164:25 GR
52 2159:5 g
62 2156:9 C in A + C in E Aq A in E bis Eq
72 2126:16 AO
81 1f1b:3 DE
97 7547:12 S
104 4303:13 BG
111 1063:9 B
126 6408:5 x
133 3198:15 FL
141 1020:12 Q
151 1020:10 LD
161 1066:7 B
172 2156:6 DN
182 2156:15 EO
194 4289:9 AC
206 6467:5 g
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from random import seed, sample\n", + "\n", + "seed(42)\n", + "\n", + "selected = sample(results, 20)\n", + "\n", + "A.table(selected)" + ] + }, + { + "cell_type": "markdown", + "id": "aab833a7-aaaa-49ee-bb23-06712544bc17", + "metadata": {}, + "source": [ + "These formulas are all so simple that TeX was not needed to display them.\n", + "\n", + "Let's see the first 2 of them in context:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "e3c73134-887d-4640-b138-46e08b7936db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

sentence 1" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

2 2122:12
sentence 1
Au
reste,
il
faut
remarquer
que
ce
n'
est
point
la
poulie
qui
cause
cette
force,
mais
seulement
le
redoublement
de
la
corde:
car
si
on
attache
encore
une
poulie
vers
formula
A
,
par
laquelle
on
passe
la
corde
formula
ABCH
,
il
ne
faudra
pas
moins
de
force
pour
tirer
formula
H
vers
formula
K
,
et
ainsi
lever
le
poids
formula
E
,
qu'
il
en
fallait
auparavant
pour
tirer
formula
C
vers
formula
G
.
Mais,
si
à
ces
deux
poulies
on
en
ajoute
encore
une
autre
vers
formula
D
,
à
laquelle
on
attache
le
poids
et
dans
laquelle
on
passe
la
corde
tout
de
même
qu'
en
la
première,
alors
on
n'
aura
pas
besoin
de
plus
de
force
pour
lever
ce
poids
de
200
livres,
que
pour
en
lever
un
de
50
sans
poulie,
à
cause
qu'
en
tirant
4
pieds
de
la
corde
on
ne
l'
élèvera
que
d'
un
pied.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

sentence 2" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

7 7538:6
sentence 5
Il
en
sera
de
même
de
tous
les
points
éloignés
également
de
part
et
d'
autre
du
point
formula
N
.\n", + "
Enfin,
par
les
lignes
formula
AB
et
formula
IN
,
soit
mené
un
plan
formula
ABHG
,\n", + "
qui
coupera
le
secteur
formula
AH
en
deux
autres
secteurs
égaux,
et
formera
le
rectangle
formula
ABHG
,
duquel
les
côtés
formula
AG
et
formula
BH
couperont
aussi
en
deux
également
les
secteurs
de
cercles
formula
ACGE
et
formula
BDHF
,
et
par
les
points
formula
G
,
formula
N
,
formula
H
soient
menées
des
lignes
droites
qui
touchent
les
arcs
formula
CE
,
formula
LM
,
formula
DF
,
lesquelles
touchantes
soient
formula
ZG 4
,
formula
XNY
,
et
formula
6 H 7
,
qui
seront
perpendiculaires
aux
demi-
diamètres
formula
AG
,
formula
IN
,
formula
BH
.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "A.show(selected[0:2], condensed=True)" + ] + }, + { + "cell_type": "markdown", + "id": "09577595-dbb0-4322-a61f-1615613325d3", + "metadata": { + "tags": [] + }, + "source": [ + "---\n", + "\n", + "# Contents\n", + "\n", + "* **[start](start.ipynb)** intro and highlights\n", + "* **search** turbo charge your hand-coding with search templates\n", + "* **[compute](compute.ipynb)** sink down a level and compute it yourself\n", + "* **[exportExcel](exportExcel.ipynb)** make tailor-made spreadsheets out of your results\n", + "\n", + "Advanced\n", + "\n", + "* **[similar sentences](similar.ipynb)** find similar sentences\n", + "\n", + "CC-BY Dirk Roorda" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorial/similar.ipynb b/tutorial/similar.ipynb new file mode 100644 index 0000000..a88232c --- /dev/null +++ b/tutorial/similar.ipynb @@ -0,0 +1,1916 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4a486875-147d-492a-9003-f05c48d841fc", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "---\n", + "\n", + "To get started: consult [start](start.ipynb)\n", + "\n", + "---\n", + "\n", + "# Similar sentences\n", + "\n", + "We explore the similar sentences in the letters of Descartes.\n", + "\n", + "They have already been diagnosed and put in an *edge* feature by running\n", + "the notebook [parallels](../programs/parallels.ipynb)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "3f0597b0-6f7d-4610-91bb-6aa93a5c3f7a", + "metadata": {}, + "source": [ + "# Incantation\n", + "\n", + "The ins and outs of installing Text-Fabric, getting the corpus, and initializing a notebook are\n", + "explained in the [start tutorial](start.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b8d43d3f-d00a-4ec3-b690-d0fa6fc9dcbe", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "156b5da3-563a-4081-967b-afd74cc314a3", + "metadata": { + "ExecuteTime": { + "end_time": "2018-05-24T10:06:39.818664Z", + "start_time": "2018-05-24T10:06:39.796588Z" + } + }, + "outputs": [], + "source": [ + "from tf.app import use" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d77aff2b-9f7d-45fb-a1a2-7d31c16c2bca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "TF-app: ~/github/CLARIAH/descartes-tf/app" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "data: ~/github/CLARIAH/descartes-tf/tf/1.0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "data: ~/github/CLARIAH/descartes-tf/parallels/tf/1.0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is Text-Fabric 11.0.7\n", + "Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html\n", + "\n", + "28 features found and 0 ignored\n", + " 0.09s Dataset without structure sections in otext:no structure functions in the T-API\n", + " 0.34s All features loaded/computed - for details use TF.isLoaded()\n", + " 0.01s All additional features loaded - for details use TF.isLoaded()\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " 
Text-Fabric: Text-Fabric API 11.0.7, CLARIAH/descartes-tf/app v3, Search Reference
\n", + " Data: DESCARTES-TF, Character table, Feature docs
\n", + "
Node types\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
Name# of nodes# slots/node% coverage
volume885241.88100
letter725940.60100
page2884236.45100
postscriptum5646.790
opener5451.970
closer54113.101
address8615.220
head72523.372
p843880.82100
sentence1433245.7496
hi59724.634
formula62001.211
figure3191.000
word6819351.00100
\n", + " Sets: no custom sets
\n", + " Features:
\n", + "
Similar Sentences\n", + "
\n", + "\n", + "
\n", + "
\n", + "sim\n", + "
\n", + "
int
\n", + "\n", + " similarity between sentences based on the Levenshtein ratio\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
Descartes = Descartes, all letters\n", + "
\n", + "\n", + "
\n", + "
\n", + "alt_date\n", + "
\n", + "
str
\n", + "\n", + " alternative date of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "alt_id\n", + "
\n", + "
str
\n", + "\n", + " alternative ids of a letter, comma separated\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "cert\n", + "
\n", + "
str
\n", + "\n", + " certainty of something\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "date\n", + "
\n", + "
str
\n", + "\n", + " date of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "id\n", + "
\n", + "
str
\n", + "\n", + " id of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "intermediary\n", + "
\n", + "
str
\n", + "\n", + " person involved in the transmission of the letter from sender to receiver\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "isitalic\n", + "
\n", + "
str
\n", + "\n", + " whether the word is in italic\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "ismargin\n", + "
\n", + "
str
\n", + "\n", + " whether the word is in the margin\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "issub\n", + "
\n", + "
str
\n", + "\n", + " whether the word is in subscript\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "issup\n", + "
\n", + "
str
\n", + "\n", + " whether the word is in supscript\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "language\n", + "
\n", + "
str
\n", + "\n", + " language of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "level\n", + "
\n", + "
str
\n", + "\n", + " level of a paragraph when it acts like a heading\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "n\n", + "
\n", + "
int
\n", + "\n", + " number of whatever element\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "notation\n", + "
\n", + "
str
\n", + "\n", + " notation method of a formula\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "otype\n", + "
\n", + "
str
\n", + "\n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "punc\n", + "
\n", + "
str
\n", + "\n", + " nonword chars after a word \n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "recipient\n", + "
\n", + "
str
\n", + "\n", + " recipient of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "recipientloc\n", + "
\n", + "
str
\n", + "\n", + " location from where a letter was received\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "resp\n", + "
\n", + "
str
\n", + "\n", + " person responsible for something\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "sender\n", + "
\n", + "
str
\n", + "\n", + " sender of a letter\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "senderloc\n", + "
\n", + "
str
\n", + "\n", + " location from where a letter was sent\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "tex\n", + "
\n", + "
str
\n", + "\n", + " unformatted TeX code of a formula, without the `$`\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "trans\n", + "
\n", + "
str
\n", + "\n", + " transcription of a word \n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "typ\n", + "
\n", + "
str
\n", + "\n", + " kind of a node; \"empty\"; \"formula\", \"head\", \"symbol\", \"illustration\"\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "url\n", + "
\n", + "
str
\n", + "\n", + " url of a graphic node\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "oslots\n", + "
\n", + "
none
\n", + "\n", + " \n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Text-Fabric API: names N F E L T S C TF directly usable

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "data: ~/github/CLARIAH/descartes-tf/source/illustrations" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Found 5 symbols
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Found 310 illustrations
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "A = use(\n", + " \"CLARIAH/descartes-tf:clone\",\n", + " checkout=\"clone\",\n", + " hoist=globals(),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f100a88e-ae13-4921-abb8-6bf5ee732af2", + "metadata": {}, + "source": [ + "# Use the similar sentences module\n", + "\n", + "You see an extra module **Similar Sentences** listed with one feature: `sim`.\n", + "It is in *italics*, which indicates it is an edge feature.\n", + "\n", + "We count how many similar pairs their are, how many 100% similar pairs there are,\n", + "and how many more than 90% but not 100%." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "055e8536-98e4-4c30-a7a2-aba9d581d7f7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0.01s 1199 results\n" + ] + } + ], + "source": [ + "query = \"\"\"\n", + "sentence\n", + "-sim> sentence\n", + "\"\"\"\n", + "results = A.search(query)" + ] + }, + { + "cell_type": "markdown", + "id": "ba934873-7783-49ab-a10a-3d62bb3fe776", + "metadata": {}, + "source": [ + "We collect the 100% results, in bidirectional form." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "16e963dd-5be2-4cbb-9b8f-a366b6fb506b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0.02s 1234 results\n" + ] + } + ], + "source": [ + "query100 = \"\"\"\n", + "sentence\n", + " sentence\n", + "\"\"\"\n", + "results100 = A.search(query100)" + ] + }, + { + "cell_type": "markdown", + "id": "4958ee6b-2d67-428f-a8e3-5d9b39284f0e", + "metadata": {}, + "source": [ + "Let's show the 90+% pairs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e97d4c90-e166-4e35-ac33-18fdb8ef0d3a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0.01s 724 results\n" + ] + } + ], + "source": [ + "query90 = \"\"\"\n", + "sentence\n", + "-sim>90> sentence\n", + "\"\"\"\n", + "results90 = A.search(query90)" + ] + }, + { + "cell_type": "markdown", + "id": "9adf7387-f2c9-4b4d-8e0b-8431dcb083f4", + "metadata": {}, + "source": [ + "Let's weed out the 100% pairs:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "55e7a921-a323-44a7-9fdd-7b66fe94ea5a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "107" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results100set = set(results100)\n", + "\n", + "results = tuple(r for r in results90 if r not in results100set)\n", + "len(results)" + ] + }, + { + "cell_type": "markdown", + "id": "d06355a8-e1f9-4d94-9c45-1b62f06016d3", + "metadata": {}, + "source": [ + "We show the top 50 of these highly similar sentence pairs:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6ad33c70-5837-4b43-958f-be8f34995313", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
npsentencesentence
11 1018:4708669 Il faut observer que la ligne NM , qui est le milieu de la lame PNOM , doit être exactement parallèle à l'axe AB de la première machine, et que la ligne perpendiculaire qui tomberait de l'axe AB sur les planches GH et IK , tombe justement sur cette ligne NM .\n", + "De plus, aux dernières figures, il faut que la même ligne NM , prolongée, passe justement par le centre de la roue Q ,\n", + "et se rencontre faire une ligne droite avec l'axe RS , sur lequel tourne le verre.708698 Enfin vous dites qu'il faut aussi observer que la ligne NM , qui fait le milieu de la lame PNOM , doit être exactement parallèle à l'axe AB de la première machine, et que la ligne perpendiculaire qui tomberait de l'axe AB sur les planches GH et IK , tombe E justement sur cette ligne NM . De plus, aux dernières figures, il faut que la même ligne NM prolongée passe justement par le centre de la roue Q et se rencontre faire une ligne droite avec l'axe RS , sur lequel tourne le verre.
22 2122:10710640 Faisons après cela qu' A , l'un des bouts de cette corde, étant attaché ferme à quelque clou, l'autre C soit derechef soutenu par un homme; et il est évident que cet homme, en C , n'aura besoin, non plus que devant, pour soutenir le poids E , que de la force qu'il faut pour soutenir cent livres: à cause que le clou qui est vers A y fait le même office que l'homme que nous y supposions auparavant. 712335 Puis, si on suppose que A , l'un des bouts de cette corde, soit attaché ferme à quelque clou, et que l'autre C soit derechef soutenu par un homme, il est évident que cet homme en C n'aura besoin non plus que devant, pour soutenir ce poids E , que de la force qu'il faut pour soutenir 100 livres, à cause que le clou qui sera vers A y fera le même office que l'homme que nous y supposions auparavant.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "A.table(results, withNodes=True, end=2)" + ] + }, + { + "cell_type": "markdown", + "id": "51419d90-d1b8-4421-b81b-52a1eba0726d", + "metadata": {}, + "source": [ + "Unfortunately, the generic mechanism of text-fabric does not show the passage of the second sentence of each similar pair.\n", + "\n", + "We can make a display by hand, and also show the similarity:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d82e9b4e-e565-407b-80c5-5d67a9afe2f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "### 1 similarity 96\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
1 1018:4   Il faut observer que la ligne NM , qui est le milieu de la lame PNOM , doit être exactement parallèle à l'axe AB de la première machine, et que la ligne perpendiculaire qui tomberait de l'axe AB sur les planches GH et IK , tombe justement sur cette ligne NM .\n", + "De plus, aux dernières figures, il faut que la même ligne NM , prolongée, passe justement par le centre de la roue Q ,\n", + "et se rencontre faire une ligne droite avec l'axe RS , sur lequel tourne le verre.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
1 1019:11   Enfin vous dites qu'il faut aussi observer que la ligne NM , qui fait le milieu de la lame PNOM , doit être exactement parallèle à l'axe AB de la première machine, et que la ligne perpendiculaire qui tomberait de l'axe AB sur les planches GH et IK , tombe E justement sur cette ligne NM . De plus, aux dernières figures, il faut que la même ligne NM prolongée passe justement par le centre de la roue Q et se rencontre faire une ligne droite avec l'axe RS , sur lequel tourne le verre.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### 2 similarity 92\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2122:10   Faisons après cela qu' A , l'un des bouts de cette corde, étant attaché ferme à quelque clou, l'autre C soit derechef soutenu par un homme; et il est évident que cet homme, en C , n'aura besoin, non plus que devant, pour soutenir le poids E , que de la force qu'il faut pour soutenir cent livres: à cause que le clou qui est vers A y fait le même office que l'homme que nous y supposions auparavant.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2164:15   Puis, si on suppose que A , l'un des bouts de cette corde, soit attaché ferme à quelque clou, et que l'autre C soit derechef soutenu par un homme, il est évident que cet homme en C n'aura besoin non plus que devant, pour soutenir ce poids E , que de la force qu'il faut pour soutenir 100 livres, à cause que le clou qui sera vers A y fera le même office que l'homme que nous y supposions auparavant.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### 3 similarity 97\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2122:10   Enfin, posons que cet homme qui est vers C tire la corde pour faire hausser le poids E ; et il est évident que, s'il y emploie la force qu'il faut pour lever 100 livres à la hauteur de deux pieds, il fera hausser ce poids E , qui en pèse 200, de la hauteur d'un pied: car la corde ABC étant doublée comme elle est, on la doit tirer de deux pieds par le bout C , pour faire autant hausser le poids E que si deux hommes la tiraient, l'un par le bout A et l'autre par le bout C , chacun de la longueur d'un pied seulement.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2164:15   Enfin, supposant que cet homme, qui est vers C , tire la corde pour faire hausser le poids E , il est évident que, s'il y emploie la force qu'il faut pour lever 100 livres à la hauteur de deux pieds, il fera hausser ce poids E , qui en pèse deux cents, de la hauteur d'un pied; car la corde ABC étant doublée comme elle est, on la doit tirer de deux pieds, par le bout C , pour faire autant hausser ce poids E que si deux hommes la tiraient, l'un par le bout A et l'autre par le bout C , chacun de la longueur d'un pied seulement.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### 4 similarity 95\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2122:29   Et pour mesurer exactement quelle doit être cette force en chaque point de la ligne courbe ABCDE , il faut savoir qu'elle y agit tout de même que si elle traînait le poids sur un plan circulairement incliné, et que l'inclination de chacun des points de ce plan circulaire se doit mesurer par celle de la ligne droite qui touche le cercle en ce point.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2164:25   Or, pour mesurer exactement quelle doit être cette force en chaque point de la ligne courbe ABCDE , il faut penser qu'elle y agit tout de même que si elle traînait le poids sur un plan circulairement incliné, et l'inclination de chacun des points de ce plan circulaire, ou sphérique, se doit mesurer par celle de la ligne droite qui touche le cercle en ce point-là.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### 5 similarity 96\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2144:8   Ergo sumendo quodlibet punctum in recta BE , et ab eo ducendo ordinatam OI , a puncto autem B ordinatam BC , major erit proportio CD ad DI ,\n", + "quam quadrati BC ad quadratum OI , quia punctum < O > est extra parabolen.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2144:10   Ergo sumendo quodlibet punctum in recta BE , et ab eo ducendo ordinatam OI , a puncto autem B ordinatam BC , major erit proportio CD ad DI ,\n", + "quam quadrati BC ad quadratum OI , quia punctum O est extra ellipsim.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### 6 similarity 93\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2144:8   Ergo sumendo quodlibet punctum in recta BE , et ab eo ducendo ordinatam OI , a puncto autem B ordinatam BC , major erit proportio CD ad DI ,\n", + "quam quadrati BC ad quadratum OI , quia punctum < O > est extra parabolen.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2144:12   Ergo sumendo quodlibet punctum in recta BE , et ab eo ducendo ordinatam OI , a puncto autem B ordinatam BC , major erit proportio CD ad DI ,\n", + "quam BC ad OI , quia punctum O est extra hyperbolen.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### 7 similarity 97\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2144:9   Sed propter similitudinem triangulorum, ut BC quadratum ad OI quadratum, ita CE quadratum ad IE quadratum; major <igitur> erit proportio CD ad DI , quam quadrati CE ad quadratum IE .\n", + "<c> cum diametro concurrens.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2144:11   Sed propter similitudinem triangulorum, ut BC quadratum ad OI quadratum, ita CE quadratum ad IE quadratum; major igitur erit proportio CD ad DI , quam quadrati CE ad quadratum IE .\n", + " \n", + "<d> cto E cum diametro concurrens.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### 8 similarity 92\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2144:10   Ergo sumendo quodlibet punctum in recta BE , et ab eo ducendo ordinatam OI , a puncto autem B ordinatam BC , major erit proportio CD ad DI ,\n", + "quam quadrati BC ad quadratum OI , quia punctum O est extra ellipsim.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2144:12   Ergo sumendo quodlibet punctum in recta BE , et ab eo ducendo ordinatam OI , a puncto autem B ordinatam BC , major erit proportio CD ad DI ,\n", + "quam BC ad OI , quia punctum O est extra hyperbolen.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### 9 similarity 98\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2144:14   Cum autem punctum B detur, <datur applicata BC ; ergo punctum C >. Datur etiam CD . Sit igitur CD aequalis B datae.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2144:17   Cum autem punctum B detur, datur applicata BC ; ergo punctum C . Datur etiam CD . Sit igitur CD aequalis D datae.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### 10 similarity 98\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2144:14   Cum autem punctum B detur, <datur applicata BC ; ergo punctum C >. Datur etiam CD . Sit igitur CD aequalis B datae.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
2 2144:20   Cum autem punctum B detur, datur applicata BC ; ergo punctum C . Datur etiam CD . Sit igitur CD aequalis D datae.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for (i, (s1, s2)) in enumerate(results[0:10]):\n", + " sim = dict(E.sim.f(s1))[s2]\n", + " A.dm(f\"### {i+1} similarity {sim}\\n\")\n", + " A.plain(s1)\n", + " A.plain(s2)" + ] + }, + { + "cell_type": "markdown", + "id": "83ab330a-f78a-46a0-b62f-dfd88db4b247", + "metadata": {}, + "source": [ + "## Edges: low-level\n", + "\n", + "We can list all edges going out from a reference node.\n", + "What we see is tuple of pairs: the target node and the similarity between the reference node and that target node." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "246c6cc2-92c7-4e99-8c6b-776280a7f337", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "refNode1=722606\n" + ] + }, + { + "data": { + "text/plain": [ + "((722626, 94),)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "refNode1 = results[-1][0]\n", + "print(f\"{refNode1=}\")\n", + "\n", + "E.sim.f(refNode1)" + ] + }, + { + "cell_type": "markdown", + "id": "e48070ee-eb27-4adc-adaf-68e0af1c8c1e", + "metadata": {}, + "source": [ + "Likewise, we can observe the nodes that target the reference node:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f1f3cc0b-df96-4e29-9d9b-779e1660b50a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "refNode2=722626\n" + ] + }, + { + "data": { + "text/plain": [ + "((722606, 94),)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "refNode2 = results[-1][1]\n", + "print(f\"{refNode2=}\")\n", + "\n", + "E.sim.t(refNode2)" + ] + }, + { + "cell_type": "markdown", + "id": "333412e0-75a8-4901-b062-07a1673293a8", + "metadata": {}, + "source": [ + "Both sets of nodes are similar to the reference node and it is inconvenient 
to use both `.f()` and `.t()` to get the similar lines.\n", + "\n", + "But there is another way:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a3a37416-96d9-49f8-92d5-b74cb7af6f9d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((719355, 80), (722626, 94))" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "E.sim.b(refNode1)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "1d6ecc9b-b990-41ee-9b35-babec04d03f0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((722606, 94),)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "E.sim.b(refNode2)" + ] + }, + { + "cell_type": "markdown", + "id": "09577595-dbb0-4322-a61f-1615613325d3", + "metadata": { + "tags": [] + }, + "source": [ + "---\n", + "\n", + "# Contents\n", + "\n", + "* **[start](start.ipynb)** intro and highlights\n", + "* **[search](search.ipynb)** turbo charge your hand-coding with search templates\n", + "* **[compute](compute.ipynb)** sink down a level and compute it yourself\n", + "* **[exportExcel](exportExcel.ipynb)** make tailor-made spreadsheets out of your results\n", + "\n", + "Advanced\n", + "\n", + "* **similar sentences** find similar sentences\n", + "\n", + "CC-BY Dirk Roorda" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorial/start.ipynb b/tutorial/start.ipynb index 48c41dc..98a3eed 100644 --- a/tutorial/start.ipynb +++ b/tutorial/start.ipynb @@ -4,9 +4,9 @@ "cell_type": 
"markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", "\n", "# Start\n", "\n", @@ -179,7 +179,19 @@ { "data": { "text/html": [ - "data: ~/github/CLARIAH/descartes-tf/tf/0.9" + "data: ~/github/CLARIAH/descartes-tf/tf/1.0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "data: ~/github/CLARIAH/descartes-tf/parallels/tf/1.0" ], "text/plain": [ "" @@ -195,9 +207,9 @@ "This is Text-Fabric 11.0.7\n", "Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html\n", "\n", - "26 features found and 0 ignored\n", - " 0.08s Dataset without structure sections in otext:no structure functions in the T-API\n", - " 0.31s All features loaded/computed - for details use TF.isLoaded()\n", + "28 features found and 0 ignored\n", + " 0.09s Dataset without structure sections in otext:no structure functions in the T-API\n", + " 0.35s All features loaded/computed - for details use TF.isLoaded()\n", " 0.01s All additional features loaded - for details use TF.isLoaded()\n" ] }, @@ -219,28 +231,21 @@ "\n", " volume\n", " 8\n", - " 85287.50\n", + " 85241.88\n", " 100\n", "\n", "\n", "\n", " letter\n", " 725\n", - " 941.10\n", + " 940.60\n", " 100\n", "\n", "\n", "\n", " page\n", " 2884\n", - " 236.58\n", - " 100\n", - "\n", - "\n", - "\n", - " p\n", - " 8438\n", - " 80.86\n", + " 236.45\n", " 100\n", "\n", "\n", @@ -252,10 +257,17 @@ "\n", "\n", "\n", - " head\n", - " 725\n", - " 23.37\n", - " 2\n", + " opener\n", + " 545\n", + " 1.97\n", + " 0\n", + "\n", + "\n", + "\n", + " closer\n", + " 541\n", + " 13.10\n", + " 1\n", "\n", "\n", "\n", @@ -266,10 +278,24 @@ "\n", "\n", "\n", - " closer\n", - " 541\n", - " 13.10\n", - " 1\n", + " head\n", + " 725\n", + " 23.37\n", + " 2\n", + "\n", + "\n", + "\n", + " p\n", + " 8438\n", + " 80.82\n", + " 100\n", + "\n", + "\n", + "\n", + " sentence\n", + " 14332\n", + " 45.74\n", + " 96\n", "\n", "\n", "\n", @@ -280,16 +306,9 @@ 
"\n", "\n", "\n", - " opener\n", - " 545\n", - " 1.97\n", - " 0\n", - "\n", - "\n", - "\n", " formula\n", " 6200\n", - " 1.27\n", + " 1.21\n", " 1\n", "\n", "\n", @@ -302,19 +321,85 @@ "\n", "\n", " word\n", - " 682300\n", + " 681935\n", " 1.00\n", " 100\n", "\n", "
\n", " Sets: no custom sets
\n", " Features:
\n", + "
Similar Sentences\n", + "
\n", + "\n", + "
\n", + "
\n", + "sim\n", + "
\n", + "
int
\n", + "\n", + "
\n", + " similarity between sentences based on the Levenshtein ratio\n", + "
\n", + "\n", + "
\n", + "
author:
\n", + "
René Descartes
\n", + "
\n", + "\n", + "
\n", + "
contributors:
\n", + "
Erik-Jan Bos; Katsuzo Murakami (University of Tokyo); Meguru Sasaki (École normale superieure d'Hokkaido); Takehumi Tokoro (University of Chyuo)
\n", + "
\n", + "\n", + "
\n", + "
converters:
\n", + "
Dirk Roorda (Text-Fabric)
\n", + "
\n", + "\n", + "
\n", + "
dateWritten:
\n", + "
2023-01-12T15:51:13Z
\n", + "
\n", + "\n", + "
\n", + "
descriptionTf:
\n", + "
Critical edition with various variants
\n", + "
\n", + "\n", + "
\n", + "
institute:
\n", + "
KNAW/Huygens Amsterdam
\n", + "
\n", + "\n", + "
\n", + "
language:
\n", + "
nld
\n", + "
\n", + "\n", + "
\n", + "
sourceFormat:
\n", + "
TEI
\n", + "
\n", + "\n", + "
\n", + "
writtenBy:
\n", + "
Text-Fabric
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", "
Descartes = Descartes, all letters\n", "
\n", "\n", "
\n", "
\n", - "alt_date\n", + "alt_date\n", "
\n", "
str
\n", "\n", @@ -339,7 +424,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -374,7 +459,7 @@ "\n", "
\n", "
\n", - "alt_id\n", + "alt_id\n", "
\n", "
str
\n", "\n", @@ -399,7 +484,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -434,7 +519,7 @@ "\n", "
\n", "
\n", - "cert\n", + "cert\n", "
\n", "
str
\n", "\n", @@ -459,7 +544,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -499,7 +584,7 @@ "\n", "
\n", "
\n", - "date\n", + "date\n", "
\n", "
str
\n", "\n", @@ -524,7 +609,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -559,7 +644,7 @@ "\n", "
\n", "
\n", - "id\n", + "id\n", "
\n", "
str
\n", "\n", @@ -584,7 +669,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -619,7 +704,7 @@ "\n", "
\n", "
\n", - "intermediary\n", + "intermediary\n", "
\n", "
str
\n", "\n", @@ -644,7 +729,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -679,7 +764,7 @@ "\n", "
\n", "
\n", - "isitalic\n", + "isitalic\n", "
\n", "
str
\n", "\n", @@ -704,7 +789,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -739,7 +824,7 @@ "\n", "
\n", "
\n", - "ismargin\n", + "ismargin\n", "
\n", "
str
\n", "\n", @@ -764,7 +849,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -799,7 +884,7 @@ "\n", "
\n", "
\n", - "issub\n", + "issub\n", "
\n", "
str
\n", "\n", @@ -824,7 +909,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -859,7 +944,7 @@ "\n", "
\n", "
\n", - "issup\n", + "issup\n", "
\n", "
str
\n", "\n", @@ -884,7 +969,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -919,7 +1004,7 @@ "\n", "
\n", "
\n", - "language\n", + "language\n", "
\n", "
str
\n", "\n", @@ -944,7 +1029,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -979,7 +1064,7 @@ "\n", "
\n", "
\n", - "level\n", + "level\n", "
\n", "
str
\n", "\n", @@ -1004,7 +1089,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -1039,7 +1124,7 @@ "\n", "
\n", "
\n", - "n\n", + "n\n", "
\n", "
int
\n", "\n", @@ -1064,7 +1149,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -1099,12 +1184,12 @@ "\n", "
\n", "
\n", - "notation\n", + "notation\n", "
\n", "
str
\n", "\n", "
\n", - " formalism used (TeX)\n", + " notation method of a formula\n", "
\n", "\n", "
\n", @@ -1124,7 +1209,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -1159,7 +1244,7 @@ "\n", "
\n", "
\n", - "otype\n", + "otype\n", "
\n", "
str
\n", "\n", @@ -1184,7 +1269,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -1219,7 +1304,7 @@ "\n", "
\n", "
\n", - "punc\n", + "punc\n", "
\n", "
str
\n", "\n", @@ -1244,7 +1329,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -1279,7 +1364,7 @@ "\n", "
\n", "
\n", - "recipient\n", + "recipient\n", "
\n", "
str
\n", "\n", @@ -1304,7 +1389,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -1339,7 +1424,7 @@ "\n", "
\n", "
\n", - "recipientloc\n", + "recipientloc\n", "
\n", "
str
\n", "\n", @@ -1364,7 +1449,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -1399,7 +1484,7 @@ "\n", "
\n", "
\n", - "resp\n", + "resp\n", "
\n", "
str
\n", "\n", @@ -1424,7 +1509,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -1464,7 +1549,7 @@ "\n", "
\n", "
\n", - "sender\n", + "sender\n", "
\n", "
str
\n", "\n", @@ -1489,7 +1574,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -1524,7 +1609,7 @@ "\n", "
\n", "
\n", - "senderloc\n", + "senderloc\n", "
\n", "
str
\n", "\n", @@ -1549,7 +1634,67 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", + "
\n", + "\n", + "
\n", + "
descriptionTf:
\n", + "
Critical edition with various variants
\n", + "
\n", + "\n", + "
\n", + "
institute:
\n", + "
KNAW/Huygens Amsterdam
\n", + "
\n", + "\n", + "
\n", + "
language:
\n", + "
nld
\n", + "
\n", + "\n", + "
\n", + "
sourceFormat:
\n", + "
TEI
\n", + "
\n", + "\n", + "
\n", + "
writtenBy:
\n", + "
Text-Fabric
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "tex\n", + "
\n", + "
str
\n", + "\n", + "
\n", + " unformatted TeX code of a formula, without the `$`\n", + "
\n", + "\n", + "
\n", + "
author:
\n", + "
René Descartes
\n", + "
\n", + "\n", + "
\n", + "
contributors:
\n", + "
Erik-Jan Bos; Katsuzo Murakami (University of Tokyo); Meguru Sasaki (École normale superieure d'Hokkaido); Takehumi Tokoro (University of Chyuo)
\n", + "
\n", + "\n", + "
\n", + "
converters:
\n", + "
Dirk Roorda (Text-Fabric)
\n", + "
\n", + "\n", + "
\n", + "
dateWritten:
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -1584,7 +1729,7 @@ "\n", "
\n", "
\n", - "trans\n", + "trans\n", "
\n", "
str
\n", "\n", @@ -1609,7 +1754,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -1644,12 +1789,12 @@ "\n", "
\n", "
\n", - "typ\n", + "typ\n", "
\n", "
str
\n", "\n", "
\n", - " kind of a node; \"empty\" means: deliberately empty slot; \"head\" means: header of letter; \"symbol\" or \"illustration\" is the kind of a figure\n", + " kind of a node; \"empty\"; \"formula\", \"head\", \"symbol\", \"illustration\"\n", "
\n", "\n", "
\n", @@ -1669,7 +1814,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -1704,7 +1849,7 @@ "\n", "
\n", "
\n", - "url\n", + "url\n", "
\n", "
str
\n", "\n", @@ -1729,7 +1874,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -1764,7 +1909,7 @@ "\n", "
\n", "
\n", - "oslots\n", + "oslots\n", "
\n", "
none
\n", "\n", @@ -1789,7 +1934,7 @@ "\n", "
\n", "
dateWritten:
\n", - "
2023-01-11T14:04:12Z
\n", + "
2023-01-12T13:07:27Z
\n", "
\n", "\n", "
\n", @@ -2385,6 +2530,23 @@ "\t--hl-strong: hsla( 60, 100%, 70%, 0.9 );\n", "\t--hl-rim: hsla( 55, 80%, 50%, 1.0 );\n", "}\n", + ".italic {\n", + " font-style: italic;\n", + "}\n", + ".margin {\n", + " position: relative;\n", + " top: -0.3em;\n", + " font-weight: bold;\n", + " color: #0000ee;\n", + "}\n", + ".sub {\n", + " vertical-align: sub;\n", + " font-size: small;\n", + "}\n", + ".sup {\n", + " vertical-align: super;\n", + " font-size: small;\n", + "}\n", "" ], "text/plain": [ @@ -2399,7 +2561,7 @@ "text/html": [ "\n", "