Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move more parsing of strings and html server-side #575

Draft
wants to merge 11 commits into
base: master
Choose a base branch
from
5 changes: 3 additions & 2 deletions modules/node_modules/@frogpond/ccc-google-calendar/index.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import {get} from '@frogpond/ccc-lib'
import {get, fastGetTrimmedText} from '@frogpond/ccc-lib'
import moment from 'moment'
import getUrls from 'get-urls'
import _jsdom from 'jsdom'
const {JSDOM} = _jsdom

function convertGoogleEvents(data, now = moment()) {
let events = data.map((event) => {
const title = fastGetTrimmedText(event.summary || '')
const startTime = moment(event.start.date || event.start.dateTime)
const endTime = moment(event.end.date || event.end.dateTime)
let description = (event.description || '').replace('<br>', '\n')
Expand All @@ -15,7 +16,7 @@ function convertGoogleEvents(data, now = moment()) {
dataSource: 'google',
startTime,
endTime,
title: event.summary || '',
title,
description: description,
location: event.location || '',
isOngoing: startTime.isBefore(now, 'day'),
Expand Down
25 changes: 25 additions & 0 deletions modules/node_modules/@frogpond/ccc-lib/html.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import {toLaxTitleCase} from '@frogpond/titlecase'

import _jsdom from 'jsdom'
const {JSDOM} = _jsdom

export {encode, decode} from 'html-entities'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How did we handle these here before adding this module? Does JSDOM handle this for us automatically when we call textContent?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great point. Looks like JSDOM handles decoding the entities properly for us. I've created a Repl to show the differences between fastGetTrimmedText and JSDOM's textContent.


// Html

/**
 * Convert a string of HTML markup into its plain-text content.
 *
 * The markup is parsed into a JSDOM fragment, the fragment's `textContent`
 * is read, and leading/trailing whitespace is stripped from the result.
 *
 * @param {string} string - the HTML markup to parse
 * @returns {string} the trimmed text content of the parsed markup
 */
export function parseHtml(string) {
	const fragment = JSDOM.fragment(string)
	const text = fragment.textContent
	return text.trim()
}

/**
 * Extract the text of an HTML string with every run of whitespace
 * collapsed to a single space.
 *
 * @param {string} elem - markup (or plain text) handed to `JSDOM.fragment`
 * @returns {string} the fragment's text content, whitespace-normalized and
 *   trimmed
 */
export function innerTextWithSpaces(elem) {
	// JSDOM.fragment() returns a DocumentFragment, not a string, so the text
	// has to be read via `textContent` before any string methods can be used.
	const text = JSDOM.fragment(elem).textContent
	return text.split(/\s+/u).join(' ').trim()
}

/**
 * Replace every `<...>` span in a string with a single space.
 *
 * This is a purely textual substitution: anything from a `<` up to the next
 * `>` is treated as a tag. No parsing or entity decoding is performed, and
 * whitespace is not collapsed afterwards.
 *
 * @param {string} str - the text to strip tags from
 * @returns {string} the input with each tag-like span replaced by ' '
 */
export function removeHtmlWithRegex(str) {
	const tagPattern = /<[^>]*>/gu
	const withoutTags = str.replace(tagPattern, ' ')
	return withoutTags
}

/**
 * Reduce a string that may contain HTML tags to a single line of text.
 *
 * Passes the input through `removeHtmlWithRegex`, then collapses every run
 * of whitespace to one space and trims both ends.
 *
 * @param {string} str - the text to clean
 * @returns {string} the cleaned, trimmed text
 */
export function fastGetTrimmedText(str) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we have JSDOM here and aren't resource constrained, I'd like to remove this fn in favor of a JSDOM-based solution

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for mentioning this. We will go with the textContent solution.

return removeHtmlWithRegex(str).replace(/\s+/gu, ' ').trim()
}

1 change: 1 addition & 0 deletions modules/node_modules/@frogpond/ccc-lib/index.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
export {get} from './http'
export * from './cache'
export * from './url'
export * from './html'
13 changes: 8 additions & 5 deletions modules/node_modules/@frogpond/ccc-presence/index.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import {get, ONE_HOUR} from '@frogpond/ccc-lib'
import {get, ONE_HOUR, parseHtml, decode} from '@frogpond/ccc-lib'
import mem from 'mem'
import lodash from 'lodash'
import _jsdom from 'jsdom'
Expand Down Expand Up @@ -45,10 +45,13 @@ export function cleanOrg(org) {
// )

let category = org.categories.join(', ')
let meetings =
(org.regularMeetingLocation || '').trim() +
(org.regularMeetingTime || '').trim()
let description = JSDOM.fragment(org.description).textContent.trim()

let meetingTime = org.regularMeetingTime || ''
let meetingLocation = org.regularMeetingLocation || ''
let meetings = `${meetingTime} ${meetingLocation}`.trim()

let description = parseHtml(org.description)

let website = (org.website || '').trim()
if (website && !/^https?:\/\//.test(website)) {
website = `http://${website}`
Expand Down
5 changes: 3 additions & 2 deletions modules/node_modules/@frogpond/ccc-reason-calendar/index.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* eslint-disable camelcase */

import {get} from '@frogpond/ccc-lib'
import {get, fastGetTrimmedText} from '@frogpond/ccc-lib'
import moment from 'moment-timezone'
import dropWhile from 'lodash/dropWhile'
import dropRightWhile from 'lodash/dropRightWhile'
Expand Down Expand Up @@ -113,6 +113,7 @@ function convertReasonEvent(event, now = moment()) {
moment(event.startTime).isBefore(now, 'day') &&
moment(event.endTime).isSameOrAfter(now)

let title = fastGetTrimmedText(event.name || '')
let description = (event.description || '').replace('<br>', '\n')
description = JSDOM.fragment(description).textContent.trim()

Expand All @@ -122,7 +123,7 @@ function convertReasonEvent(event, now = moment()) {
dataSource: 'reason',
startTime: event.startTime,
endTime: event.endTime,
title: event.name || '',
title: title,
description: description,
location: event.location || '',
links: links,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
import {get, ONE_DAY} from '@frogpond/ccc-lib'
import {get, ONE_DAY, fastGetTrimmedText, parseHtml, decode} from '@frogpond/ccc-lib'
import mem from 'mem'
import _jsdom from 'jsdom'
import getUrls from 'get-urls'

const {JSDOM} = _jsdom

export function cleanJob(job) {
const title = fastGetTrimmedText(job.title)
const office = fastGetTrimmedText(job.office)
const hoursPerWeek = fastGetTrimmedText(job.hoursPerWeek)
const timeOfHours = fastGetTrimmedText(job.timeOfHours)

// these all need to retain their newlines
const description = cleanTextBlock(
JSDOM.fragment(job.description).textContent,
)
const comments = cleanTextBlock(JSDOM.fragment(job.comments).textContent)
const skills = cleanTextBlock(JSDOM.fragment(job.skills).textContent)
const howToApply = cleanTextBlock(JSDOM.fragment(job.howToApply).textContent)
const timeline = cleanTextBlock(JSDOM.fragment(job.timeline).textContent)
const timeOfHours = cleanTextBlock(
JSDOM.fragment(job.timeOfHours).textContent,
)
const description = cleanTextBlock(job.description)
const comments = cleanTextBlock(job.comments)
const skills = cleanTextBlock(job.skills)
const howToApply = cleanTextBlock(job.howToApply)
const timeline = cleanTextBlock(job.timeline)

const contactEmail = fixupEmailFormat(job.contactEmail)
const contactPhone = fixupPhoneFormat(job.contactPhone)
Expand All @@ -34,12 +34,12 @@ export function cleanJob(job) {
contactPhone: contactPhone,
description: description,
goodForIncomingStudents: job.goodForIncomingStudents,
hoursPerWeek: job.hoursPerWeek,
hoursPerWeek: hoursPerWeek,
howToApply: howToApply,
id: job.id,
lastModified: job.lastModified,
links: links,
office: job.office,
office: office,
openPositions: job.openPositions,
skills: skills,
timeline: timeline,
Expand All @@ -51,7 +51,9 @@ export function cleanJob(job) {
}

function cleanTextBlock(text) {
return text.replace(/\s+/g, ' ')
return decode(
parseHtml(text).replace(/\t/g, ' ')
)
}

export function getLinksFromJob({description, comments, skills, howToApply}) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import {get, ONE_HOUR} from '@frogpond/ccc-lib'
import {get, ONE_HOUR, innerTextWithSpaces, parseHtml} from '@frogpond/ccc-lib'
import mem from 'mem'
import moment from 'moment-timezone'

Expand All @@ -19,9 +19,17 @@ export async function getStreams({streamClass, sort, dateFrom, dateTo}) {
(resp) => resp.body,
)
const processed = data.results.map((stream) => {
let {starttime} = stream
let {starttime, title, subtitle, performer} = stream

let streamTitle = innerTextWithSpaces(parseHtml(title))
let detail = innerTextWithSpaces(
parseHtml(subtitle || performer || ''),
)

return {
...stream,
title: streamTitle,
subtitle: detail,
starttime: moment
.tz(starttime, 'YYYY-MM-DD HH:mm', 'America/Chicago')
.toISOString(),
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"test": "./scripts/smoke-test.sh"
},
"dependencies": {
"@frogpond/titlecase": "^1.0.0",
"dotenv": "10.0.0",
"esm": "3.2.25",
"get-urls": "10.0.1",
Expand Down
5 changes: 5 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,11 @@
minimatch "^3.0.4"
strip-json-comments "^3.1.1"

"@frogpond/titlecase@^1.0.0":
version "1.0.0"
resolved "https://registry.yarnpkg.com/@frogpond/titlecase/-/titlecase-1.0.0.tgz#993e5371c31b58a839e76fdc531053c55b1a2736"
integrity sha512-C1qKm/J+B+cXo+7+ZHbRnt2iApx/IrxMEXwOxe+ZkeTvSC1nZ2XRWf0xzFuAanpBNDhFcgrxdnR8FMKNHo1scQ==

"@gar/promisify@^1.0.1":
version "1.1.2"
resolved "https://registry.yarnpkg.com/@gar/promisify/-/promisify-1.1.2.tgz#30aa825f11d438671d585bd44e7fd564535fc210"
Expand Down