Skip to content

Commit

Permalink
test: mise à jour des données avec les fiches au format DSFR
Browse files Browse the repository at this point in the history
  • Loading branch information
m-maillot committed Oct 16, 2024
1 parent e4338d7 commit 66bcffa
Show file tree
Hide file tree
Showing 15 changed files with 27,046 additions and 5,220 deletions.
1,102 changes: 1,098 additions & 4 deletions local.data.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
"@babel/preset-env": "^7.16.4",
"@swc/core": "^1.3.21",
"@swc/jest": "^0.2.23",
"@types/jsdom": "^21.1.7",
"@typescript-eslint/eslint-plugin": "^5.45.0",
"@typescript-eslint/parser": "^5.45.0",
"babel-jest": "^27.4.4",
Expand Down
700 changes: 481 additions & 219 deletions src/fetch-data/__tests__/__snapshots__/parseDom.test.ts.snap

Large diffs are not rendered by default.

1,554 changes: 1,554 additions & 0 deletions src/fetch-data/__tests__/article-complex-html.html

Large diffs are not rendered by default.

2,316 changes: 0 additions & 2,316 deletions src/fetch-data/__tests__/article-img-within-picture.html

This file was deleted.

2,554 changes: 0 additions & 2,554 deletions src/fetch-data/__tests__/article-picture.html

This file was deleted.

9,151 changes: 9,151 additions & 0 deletions src/fetch-data/__tests__/article-with-updated-date.html

Large diffs are not rendered by default.

9,150 changes: 9,150 additions & 0 deletions src/fetch-data/__tests__/article-without-updated-date.html

Large diffs are not rendered by default.

1,501 changes: 1,501 additions & 0 deletions src/fetch-data/__tests__/double-video.html

Large diffs are not rendered by default.

1,553 changes: 1,553 additions & 0 deletions src/fetch-data/__tests__/h2-in-list.html

Large diffs are not rendered by default.

2,199 changes: 2,199 additions & 0 deletions src/fetch-data/__tests__/harcelement-travail.html

Large diffs are not rendered by default.

199 changes: 169 additions & 30 deletions src/fetch-data/__tests__/parseDom.test.ts

Large diffs are not rendered by default.

12 changes: 3 additions & 9 deletions src/fetch-data/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,12 @@ const FEED_URL = "https://travail-emploi.gouv.fr/?page=oseo_json";

const limit = pLimit(10);

export async function fetchFeed(url) {
const response = await got.post(injectToken(url), {
http2: true,
retry: 3,
});
const { fiches: feed } = JSON.parse(response.body);
const localJson = fs.readFileSync(
export async function fetchFeed() {
const localJsonData = fs.readFileSync(
path.join(__dirname, "../../local.data.json"),
"utf8"
);
const { fiches: localFeed } = JSON.parse(localJson);
return [...feed, ...localFeed];
return JSON.parse(localJsonData).fiches;
}

export async function scrap(urls) {
Expand Down
248 changes: 160 additions & 88 deletions src/fetch-data/parseDom.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { ParseError } from "got";
import { encode } from "../email";
import { extractReferences } from "./referenceExtractor";
import { resolveReferences } from "./referenceResolver";
import { JSDOM } from "jsdom";

const $$ = (node, selector) => Array.from(node.querySelectorAll(selector));
const $ = (node, selector) => node.querySelector(selector);
Expand Down Expand Up @@ -123,7 +124,7 @@ const getReferences = (text) => {
return resolveReferences(references);
};

const textClean = (text, noNbsp = false) => {
export const textClean = (text, noNbsp = false) => {
const regexStr = "\\n";
return text
.replace(
Expand All @@ -135,73 +136,163 @@ const textClean = (text, noNbsp = false) => {
.trim();
};

const titleTags = ["h2", "h3", "h4", "h5"];
function parseHTMLSections(dom) {
const document = dom.window.document;

const getSections = (
article,
children,
sections = [
{
anchor: "",
description: "",
const mainContent = $(document, ".main-content");
if (!mainContent) {
throw new Error('No <div class="main-content"> found in the HTML content.');
}

const sections = [];

const h2Tags = $$(mainContent, "h2");

h2Tags.forEach((h2Tag) => {
const section = {
title: textClean(h2Tag.textContent, true) || "",
html: "",
references: {},
text: "",
title: "",
},
],
fromDiv = false
) => {
for (let i = 0; i < children.length; i++) {
const el = children[i];
const lastSection = sections[sections.length - 1];
if (
!fromDiv &&
titleTags.indexOf(el.tagName.toLowerCase()) !== -1 &&
el.textContent.trim() !== ""
) {
const text = textClean(lastSection.text, true);
lastSection.html = textClean(lastSection.html);
lastSection.description = text.slice(0, 200).trim();
lastSection.text = text;
lastSection.references = getReferences(text);
sections.push({
anchor:
el.getAttribute("id") || slugify(textClean(el.textContent, true)),
description: "",
html: "",
references: {},
text: "",
title: textClean(el.textContent, true),
});
} else if (
["section", "article", "div"].indexOf(el.tagName.toLowerCase()) !== -1
) {
if (el.tagName === "DIV") {
lastSection.html += el.outerHTML;
lastSection.text += el.textContent;
};

let nextSibling = h2Tag.nextElementSibling;
if (!nextSibling) {
nextSibling = h2Tag.parentElement
? h2Tag.parentElement.nextElementSibling
: undefined;
if (!nextSibling && h2Tag.parentElement) {
nextSibling = h2Tag.parentElement.parentElement
? h2Tag.parentElement.parentElement.nextElementSibling
: undefined;
}
sections = getSections(
article,
el.children,
sections,
el.tagName === "DIV"
);
} else if (
lastSection &&
titleTags.indexOf(el.tagName.toLowerCase()) === -1
) {
lastSection.html += el.outerHTML;
lastSection.text += el.textContent;
}
const sectionHtmlContent = [];
const sectionTextContent = [];

while (nextSibling && nextSibling.nodeName !== "H2") {
sectionHtmlContent.push(textClean(nextSibling.outerHTML || "", true));
sectionTextContent.push(textClean(nextSibling.textContent || "", true));
nextSibling = nextSibling.nextElementSibling;
}

section.html = sectionHtmlContent.join("").trim();
section.text = sectionTextContent.join("").trim();
sections.push(section);
});

if (sections.find((section) => section.html === "")) {
return [
{
title: "Contenu",
html: mainContent.innerHTML,
text: mainContent.textContent,
},
];
}
return sections.map((section) => ({
...section,
// Sometimes, we have all the html in a section
// We check a second times and delete HTML from the h2 found
// (H2 should not be in a section)
html: removeExtraH2(section.html),
}));
}

/**
 * Truncates a section's HTML at the first <h2> it contains.
 *
 * The fragment is wrapped in a <div> and parsed with JSDOM. When an <h2> is
 * found, the element siblings following its parent, the element siblings
 * following the <h2> itself, and finally the <h2> are removed. Only element
 * siblings are removed (the walk uses `nextElementSibling`).
 *
 * @param {string} html - HTML fragment of a single section.
 * @returns {string} The remaining inner HTML of the wrapper, passed through
 *   `textClean(…, true)`.
 */
const removeExtraH2 = (html) => {
  const { document: doc } = new JSDOM(`<div>${html}</div>`).window;
  const wrapper = $(doc, "div");
  const strayHeading = $(wrapper, "h2");

  if (!strayHeading) {
    return textClean(wrapper.innerHTML, true);
  }

  // Repeatedly drop the element that directly follows `node`.
  const dropFollowingElements = (node) => {
    for (
      let next = node.nextElementSibling;
      next;
      next = node.nextElementSibling
    ) {
      next.remove();
    }
  };

  // Order matters: the parent's followers go first, then the heading's own
  // followers, and only then the heading itself.
  dropFollowingElements(strayHeading.parentElement);
  dropFollowingElements(strayHeading);
  strayHeading.remove();

  return textClean(wrapper.innerHTML, true);
};

/**
 * Extracts the "highlight": the content of `.main-content` that comes before
 * the first <h2> sibling of its first element child.
 *
 * The walk uses `nextSibling`, so bare text nodes between elements still
 * contribute to `text`. Nodes that are neither elements nor text (HTML
 * comments, processing instructions, …) are skipped: their `textContent` is
 * the comment/instruction data and must not leak into the extracted text.
 *
 * @param {{ window: { document: Document } }} dom - Parsed page (JSDOM-like).
 * @returns {{ title: string, html: string, text: string } | undefined}
 *   A title-less section, or `undefined` when `.main-content` has no element
 *   child or starts directly with an <h2>.
 * @throws {Error} When the document has no `.main-content` element.
 */
const parseHighlight = (dom) => {
  const mainContent = $(dom.window.document, ".main-content");
  if (!mainContent) {
    throw new Error('No <div class="main-content"> found in the HTML content.');
  }

  // DOM nodeType constants (Node.ELEMENT_NODE / Node.TEXT_NODE).
  const ELEMENT_NODE = 1;
  const TEXT_NODE = 3;

  const highlightHtmlContent = [];
  const highlightTextContent = [];

  for (
    let node = mainContent.firstElementChild;
    node && node.nodeName !== "H2";
    node = node.nextSibling
  ) {
    if (node.nodeType !== ELEMENT_NODE && node.nodeType !== TEXT_NODE) {
      continue;
    }
    // Text nodes have no outerHTML, hence the empty-string fallback.
    highlightHtmlContent.push(textClean(node.outerHTML || "", true));
    highlightTextContent.push(textClean(node.textContent || "", true));
  }

  if (highlightHtmlContent.length === 0) {
    return undefined;
  }

  return {
    title: "",
    html: textClean(highlightHtmlContent.join("").trim(), true),
    text: highlightTextContent.join("").trim(),
  };
};

/**
 * Reads the article date from the <span> elements of the first <p> found in
 * `article`.
 *
 * A span containing "Mis à jour le" provides the update date, a span
 * containing "Publié le" provides the publication date. The update date is
 * preferred when both are present.
 *
 * @param {Element} article - Root element to search in.
 * @returns {string} The first `d/m/yyyy`-shaped match of the chosen span.
 * @throws {Error} When there is no <p>, or no dated span in it.
 */
const getDate = (article) => {
  const firstParagraph = $(article, "p");
  if (!firstParagraph) {
    throw new Error("Can't find the updated date, first paragraph missing");
  }

  const datePattern = /\d{1,2}\/\d{1,2}\/\d{4}/;
  let publicationAt = null;
  let updatedAt = null;

  // Each assignment stores the match result (array or null); a later span
  // with the same label overrides an earlier one.
  $$(firstParagraph, "span").forEach((span) => {
    const textContent = span.textContent;
    if (textContent.includes("Publié le")) {
      publicationAt = textContent.match(datePattern);
    }
    if (textContent.includes("Mis à jour le")) {
      updatedAt = textContent.match(datePattern);
    }
  });

  const chosen = updatedAt ?? publicationAt;
  if (chosen) {
    return chosen[0];
  }
  throw new Error("Can't find the updated date in the first paragraph");
};

/**
 * Builds the final section records from raw `{ title, html, text }` sections.
 *
 * @param {Array<{ title: string, html: string, text: string }>} sections
 * @returns {Array<object>} One record per input section with `anchor`
 *   (slugified title), `description` (first 200 characters of the text),
 *   `html`, `references` (resolved from the text), `text` and `title`.
 */
const populateSections = (sections) =>
  sections.map(({ title, html, text }) => {
    const anchor = slugify(title);
    const description = text.slice(0, 200);
    const references = getReferences(text);
    return { anchor, description, html, references, text, title };
  });

export function parseDom(dom, id, url) {
const article = $(dom.window.document, "main");
const article = $(dom.window.document, "article");
if (!article) {
throw new ParseError("no <main>");
throw new ParseError("no <article>");
}
if (!id) {
throw new ParseError(`No id`);
Expand All @@ -212,8 +303,6 @@ export function parseDom(dom, id, url) {
$$(article, ".cs_blocs").forEach(flattenCsBlocs);
$$(article, "img").forEach(formatImage);

$$(article, "style").forEach(removeNode);
$$(article, "button").forEach(removeNode);
$$(article, ".oembed-source").forEach(removeNode);

let titleElement = $(article, "h1");
Expand All @@ -225,40 +314,23 @@ export function parseDom(dom, id, url) {
}
const title = textClean(titleElement.textContent, true);

const dateRaw =
$(dom.window.document, "time:nth-child(1)") ||
$(dom.window.document, "time:first-child");
const date = dateRaw?.textContent;
const introImg = $(dom.window.document, "article img")?.outerHTML;
const date = getDate(article);
let intro = $(article, ".fr-text--lead") || "";
intro =
intro &&
textClean(
introImg ? introImg + intro.innerHTML : intro.innerHTML,
true
).replace(/<script[^>]*>([\s\S]*?)<\/script>/g, "");
textClean(intro.innerHTML, true).replace(
/<script[^>]*>([\s\S]*?)<\/script>/g,
""
);
const description =
$(dom.window.document, "meta[name=description]")?.getAttribute("content") ??
"";

let sections = [];

const mainElement = $$(article, `.main-content`)[0];
if (mainElement) {
const articleSectionChildren = mainElement ? [...mainElement.children] : [];

sections = sections.concat(
getSections(mainElement, articleSectionChildren).filter(
({ anchor }) =>
[
"textes-de-reference",
"qui-contacter",
"articles-associes",
"lire-en-complement",
"documents",
].indexOf(anchor) === -1
)
);
let sections = parseHTMLSections(dom);

const highlight = parseHighlight(dom);
if (highlight) {
sections.unshift(highlight);
}

if (sections.length === 0) {
Expand All @@ -270,7 +342,7 @@ export function parseDom(dom, id, url) {
description,
intro,
pubId: id,
sections,
sections: populateSections(sections),
title,
url,
};
Expand Down
26 changes: 26 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1475,6 +1475,15 @@
jest-diff "^27.0.0"
pretty-format "^27.0.0"

"@types/jsdom@^21.1.7":
version "21.1.7"
resolved "https://registry.yarnpkg.com/@types/jsdom/-/jsdom-21.1.7.tgz#9edcb09e0b07ce876e7833922d3274149c898cfa"
integrity sha512-yOriVnggzrnQ3a9OKOCxaVuSug3w3/SbOj5i7VwXWZEyUNl3bLF9V3MfxGbZKuwqJOQyRfqXyROBB1CoZLFWzA==
dependencies:
"@types/node" "*"
"@types/tough-cookie" "*"
parse5 "^7.0.0"

"@types/json-buffer@~3.0.0":
version "3.0.0"
resolved "https://registry.yarnpkg.com/@types/json-buffer/-/json-buffer-3.0.0.tgz#85c1ff0f0948fc159810d4b5be35bf8c20875f64"
Expand Down Expand Up @@ -1519,6 +1528,11 @@
resolved "https://registry.yarnpkg.com/@types/stack-utils/-/stack-utils-2.0.0.tgz#7036640b4e21cc2f259ae826ce843d277dad8cff"
integrity sha512-RJJrrySY7A8havqpGObOB4W92QXKJo63/jFLLgpvOtsGUqbQZ9Sbgl35KMm1DjC6j7AvmmU2bIno+3IyEaemaw==

"@types/tough-cookie@*":
version "4.0.5"
resolved "https://registry.yarnpkg.com/@types/tough-cookie/-/tough-cookie-4.0.5.tgz#cb6e2a691b70cb177c6e3ae9c1d2e8b2ea8cd304"
integrity sha512-/Ad8+nIOV7Rl++6f1BdKxFSMgmoqEoYbHRpPcx3JEfv8VRsQe9Z4mCXeJBzxs7mbHY/XOZZuXlRNfhpVPbs6ZA==

"@types/unist@^2.0.0":
version "2.0.3"
resolved "https://registry.yarnpkg.com/@types/unist/-/unist-2.0.3.tgz#9c088679876f374eb5983f150d4787aa6fb32d7e"
Expand Down Expand Up @@ -2228,6 +2242,11 @@ end-of-stream@^1.1.0:
dependencies:
once "^1.4.0"

entities@^4.4.0:
version "4.5.0"
resolved "https://registry.yarnpkg.com/entities/-/entities-4.5.0.tgz#5d268ea5e7113ec74c4d033b79ea5a35a488fb48"
integrity sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==

error-ex@^1.3.1:
version "1.3.2"
resolved "https://registry.yarnpkg.com/error-ex/-/error-ex-1.3.2.tgz#b4ac40648107fdcdcfae242f428bea8a14d4f1bf"
Expand Down Expand Up @@ -3786,6 +3805,13 @@ [email protected]:
resolved "https://registry.yarnpkg.com/parse5/-/parse5-6.0.1.tgz#e1a1c085c569b3dc08321184f19a39cc27f7c30b"
integrity sha512-Ofn/CTFzRGTTxwpNEs9PP93gXShHcTq255nzRYSKe8AkVpZY7e1fpmTfOyoIvjP5HG7Z2ZM7VS9PPhQGW2pOpw==

parse5@^7.0.0:
version "7.1.2"
resolved "https://registry.yarnpkg.com/parse5/-/parse5-7.1.2.tgz#0736bebbfd77793823240a23b7fc5e010b7f8e32"
integrity sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==
dependencies:
entities "^4.4.0"

path-exists@^4.0.0:
version "4.0.0"
resolved "https://registry.yarnpkg.com/path-exists/-/path-exists-4.0.0.tgz#513bdbe2d3b95d7762e8c1137efa195c6c61b5b3"
Expand Down

0 comments on commit 66bcffa

Please sign in to comment.