Skip to content

Commit

Permalink
Unify str.strip
Browse files Browse the repository at this point in the history
  • Loading branch information
plutasnyy committed Feb 5, 2025
1 parent 30e771c commit 08820b4
Show file tree
Hide file tree
Showing 8 changed files with 551 additions and 136 deletions.
62 changes: 46 additions & 16 deletions test_unstructured/documents/unstructured_json_output/example.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
"element_id": "3a6b156a81764e17be128264241f8136",
"metadata": {
"category_depth": 0,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "897a8a47377c4ad6aab839a929879537",
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
Expand All @@ -15,7 +18,10 @@
"element_id": "45b3d0053468484ba1c7b53998115412",
"metadata": {
"category_depth": 1,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "3a6b156a81764e17be128264241f8136",
"text_as_html": "<header class=\"Header\" id=\"45b3d0053468484ba1c7b53998115412\" />"
Expand All @@ -27,10 +33,13 @@
"element_id": "c95473e8a3704fc2b418697f9fddb27b",
"metadata": {
"category_depth": 2,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "45b3d0053468484ba1c7b53998115412",
"text_as_html": "<h1 class=\"Title\" id=\"c95473e8a3704fc2b418697f9fddb27b\">Header </h1>"
"text_as_html": "<h1 class=\"Title\" id=\"c95473e8a3704fc2b418697f9fddb27b\">Header</h1>"
},
"text": "Header",
"type": "Title"
Expand All @@ -39,10 +48,13 @@
"element_id": "379cbfdc16d44bd6a59e6cfabe6438d5",
"metadata": {
"category_depth": 2,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "45b3d0053468484ba1c7b53998115412",
"text_as_html": "<time class=\"CalendarDate\" id=\"379cbfdc16d44bd6a59e6cfabe6438d5\">Date: October 30, 2023 </time>"
"text_as_html": "<time class=\"CalendarDate\" id=\"379cbfdc16d44bd6a59e6cfabe6438d5\">Date: October 30, 2023</time>"
},
"text": "Date: October 30, 2023",
"type": "UncategorizedText"
Expand All @@ -51,10 +63,13 @@
"element_id": "637c2f6935fb4353a5f73025ce04619d",
"metadata": {
"category_depth": 1,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "3a6b156a81764e17be128264241f8136",
"text_as_html": "<form class=\"Form\" id=\"637c2f6935fb4353a5f73025ce04619d\"> <label class=\"FormField\" for=\"company-name\" id=\"50027cccbe1948c9853ce0de37b635c2\">From field name </label><input class=\"FormFieldValue\" id=\"0032242af75c4b37984ea7fea9aac74c\" value=\"Example value\" /></form>"
"text_as_html": "<form class=\"Form\" id=\"637c2f6935fb4353a5f73025ce04619d\"><label class=\"FormField\" for=\"company-name\" id=\"50027cccbe1948c9853ce0de37b635c2\">From field name</label><input class=\"FormFieldValue\" id=\"0032242af75c4b37984ea7fea9aac74c\" value=\"Example value\" /></form>"
},
"text": "From field name Example value",
"type": "UncategorizedText"
Expand All @@ -63,7 +78,10 @@
"element_id": "592422373ed741b68a077e2003f8ed81",
"metadata": {
"category_depth": 1,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "3a6b156a81764e17be128264241f8136",
"text_as_html": "<section class=\"Section\" id=\"592422373ed741b68a077e2003f8ed81\" />"
Expand All @@ -75,10 +93,13 @@
"element_id": "dc3792d4422e444f90876b56d0cfb20d",
"metadata": {
"category_depth": 2,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "592422373ed741b68a077e2003f8ed81",
"text_as_html": "<table class=\"Table\" id=\"dc3792d4422e444f90876b56d0cfb20d\"> <thead> <tr> <th>Description</th><th>Row header</th></tr></thead><tbody> <tr> <td>Value description</td><td>50 $ (1.32 %)</td></tr></tbody></table>"
"text_as_html": "<table class=\"Table\" id=\"dc3792d4422e444f90876b56d0cfb20d\"><thead><tr><th>Description</th><th>Row header</th></tr></thead><tbody><tr><td>Value description</td><td><span>50 $</span><span>(1.32 %)</span></td></tr></tbody></table>"
},
"text": "Description Row header Value description 50 $ (1.32 %)",
"type": "Table"
Expand All @@ -87,7 +108,10 @@
"element_id": "1032242af75c4b37984ea7fea9aac74c",
"metadata": {
"category_depth": 1,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "3a6b156a81764e17be128264241f8136",
"text_as_html": "<section class=\"Section\" id=\"1032242af75c4b37984ea7fea9aac74c\" />"
Expand All @@ -99,10 +123,13 @@
"element_id": "2a4e2c4a689f4f9a8c180b6b521e45c3",
"metadata": {
"category_depth": 2,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "1032242af75c4b37984ea7fea9aac74c",
"text_as_html": "<h2 class=\"Subtitle\" id=\"2a4e2c4a689f4f9a8c180b6b521e45c3\">2. Subtitle </h2>"
"text_as_html": "<h2 class=\"Subtitle\" id=\"2a4e2c4a689f4f9a8c180b6b521e45c3\">2. Subtitle</h2>"
},
"text": "2. Subtitle",
"type": "Title"
Expand All @@ -111,10 +138,13 @@
"element_id": "5591f7a4df01447e82515ce45f686fbe",
"metadata": {
"category_depth": 2,
"filename": "example.pdf",
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "1032242af75c4b37984ea7fea9aac74c",
"text_as_html": "<p class=\"NarrativeText\" id=\"5591f7a4df01447e82515ce45f686fbe\">Paragraph text </p>"
"text_as_html": "<p class=\"NarrativeText\" id=\"5591f7a4df01447e82515ce45f686fbe\">Paragraph text</p>"
},
"text": "Paragraph text",
"type": "NarrativeText"
Expand Down
Loading

0 comments on commit 08820b4

Please sign in to comment.