Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move more parsing of strings and html server-side #575

Draft
wants to merge 11 commits into
base: master
Choose a base branch
from
14 changes: 8 additions & 6 deletions modules/node_modules/@frogpond/ccc-google-calendar/index.js
Original file line number Diff line number Diff line change
@@ -1,21 +1,19 @@
import {get} from '@frogpond/ccc-lib'
import {get, parseHtml} from '@frogpond/ccc-lib'
import moment from 'moment'
import getUrls from 'get-urls'
import _jsdom from 'jsdom'
const {JSDOM} = _jsdom

function convertGoogleEvents(data, now = moment()) {
let events = data.map((event) => {
const title = parseHtml(event.summary || '')
const startTime = moment(event.start.date || event.start.dateTime)
const endTime = moment(event.end.date || event.end.dateTime)
let description = (event.description || '').replace('<br>', '\n')
description = JSDOM.fragment(description).textContent.trim()
let description = cleanTextBlock(event.description || '')

return {
dataSource: 'google',
startTime,
endTime,
title: event.summary || '',
title,
description: description,
location: event.location || '',
isOngoing: startTime.isBefore(now, 'day'),
Expand All @@ -31,6 +29,10 @@ function convertGoogleEvents(data, now = moment()) {
return events
}

/**
 * Convert an HTML snippet into plain text while keeping explicit line breaks.
 *
 * @param {string} text - HTML (or plain text) to clean up.
 * @returns {string} The text content with `<br>` tags turned into newlines,
 *   tabs turned into spaces, and surrounding whitespace trimmed.
 */
function cleanTextBlock(text) {
  // <br> has to become "\n" *before* parseHtml runs: parseHtml returns only the
  // text content of the markup, so a <br> element contributes nothing and a
  // replace performed afterwards can never match. The regex is global and
  // tolerant of `<br/>`, `<br />` and `<BR>` so every break is converted,
  // not just the first literal "<br>".
  const withNewlines = text.replace(/<br\s*\/?>/gi, '\n')
  return parseHtml(withNewlines).replace(/\t/g, ' ').trim()
}

export async function googleCalendar(calendarId, now = moment()) {
let calendarUrl = `https://www.googleapis.com/calendar/v3/calendars/${calendarId}/events`

Expand Down
14 changes: 14 additions & 0 deletions modules/node_modules/@frogpond/ccc-lib/html.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import {toLaxTitleCase} from '@frogpond/titlecase'

import _jsdom from 'jsdom'
const {JSDOM} = _jsdom

// Html

/**
 * Reduce an HTML string to its text content.
 *
 * @param {string} string - Markup to parse with `JSDOM.fragment`.
 * @returns {string} The fragment's `textContent`, trimmed of leading and
 *   trailing whitespace.
 */
export function parseHtml(string) {
  const fragment = JSDOM.fragment(string)
  const text = fragment.textContent
  return text.trim()
}

/**
 * Extract the text content of some markup and collapse every run of
 * whitespace into a single space.
 *
 * NOTE(review): despite the parameter name, the value is handed straight to
 * `JSDOM.fragment`, which parses an HTML string - confirm callers pass markup
 * rather than a DOM element.
 *
 * @param {string} elem - Markup to parse.
 * @returns {string} Whitespace-normalised, trimmed text content.
 */
export function innerTextWithSpaces(elem) {
  // JSDOM.fragment returns a DocumentFragment, which has no `split` method;
  // the text has to be read from `.textContent` before it can be split.
  const text = JSDOM.fragment(elem).textContent
  return text.split(/\s+/u).join(' ').trim()
}
1 change: 1 addition & 0 deletions modules/node_modules/@frogpond/ccc-lib/index.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
export {get} from './http'
export * from './cache'
export * from './url'
export * from './html'
15 changes: 8 additions & 7 deletions modules/node_modules/@frogpond/ccc-presence/index.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import {get, ONE_HOUR} from '@frogpond/ccc-lib'
import {get, ONE_HOUR, parseHtml} from '@frogpond/ccc-lib'
import mem from 'mem'
import lodash from 'lodash'
import _jsdom from 'jsdom'
import pMap from 'p-map'
const {sortBy, startCase} = lodash
const {JSDOM} = _jsdom

/*
type ContactPersonType = {
Expand Down Expand Up @@ -45,10 +43,13 @@ export function cleanOrg(org) {
// )

let category = org.categories.join(', ')
let meetings =
(org.regularMeetingLocation || '').trim() +
(org.regularMeetingTime || '').trim()
let description = JSDOM.fragment(org.description).textContent.trim()

let meetingTime = org.regularMeetingTime || ''
let meetingLocation = parseHtml(org.regularMeetingLocation || '')
let meetings = `${meetingTime} ${meetingLocation}`.trim()

let description = parseHtml(org.description)

let website = (org.website || '').trim()
if (website && !/^https?:\/\//.test(website)) {
website = `http://${website}`
Expand Down
14 changes: 8 additions & 6 deletions modules/node_modules/@frogpond/ccc-reason-calendar/index.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
/* eslint-disable camelcase */

import {get} from '@frogpond/ccc-lib'
import {get, parseHtml} from '@frogpond/ccc-lib'
import moment from 'moment-timezone'
import dropWhile from 'lodash/dropWhile'
import dropRightWhile from 'lodash/dropRightWhile'
import sortBy from 'lodash/sortBy'
import getUrls from 'get-urls'
import _jsdom from 'jsdom'
const {JSDOM} = _jsdom

const TZ = 'US/Central'

Expand Down Expand Up @@ -113,16 +111,16 @@ function convertReasonEvent(event, now = moment()) {
moment(event.startTime).isBefore(now, 'day') &&
moment(event.endTime).isSameOrAfter(now)

let description = (event.description || '').replace('<br>', '\n')
description = JSDOM.fragment(description).textContent.trim()
let title = parseHtml(event.name || '')
let description = cleanTextBlock(event.description || '')

let links = description ? [...getUrls(description)] : []

return {
dataSource: 'reason',
startTime: event.startTime,
endTime: event.endTime,
title: event.name || '',
title: title,
description: description,
location: event.location || '',
links: links,
Expand All @@ -138,6 +136,10 @@ function convertReasonEvent(event, now = moment()) {
}
}

/**
 * Convert an HTML snippet into plain text while keeping explicit line breaks.
 *
 * @param {string} text - HTML (or plain text) to clean up.
 * @returns {string} The text content with `<br>` tags turned into newlines
 *   and surrounding whitespace trimmed.
 */
function cleanTextBlock(text) {
  // <br> has to become "\n" *before* parseHtml runs: parseHtml returns only the
  // text content of the markup, so a <br> element contributes nothing and a
  // replace performed afterwards can never match. The regex is global and
  // tolerant of `<br/>`, `<br />` and `<BR>` so every break is converted,
  // not just the first literal "<br>".
  const withNewlines = text.replace(/<br\s*\/?>/gi, '\n')
  return parseHtml(withNewlines).trim()
}

export async function reasonCalendar(calendarUrl, now = moment()) {
let dateParams = {
// eslint-disable-next-line camelcase
Expand Down
8 changes: 4 additions & 4 deletions modules/node_modules/@frogpond/ccc-rss-feed/index.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import {get} from '@frogpond/ccc-lib'
import {get, parseHtml} from '@frogpond/ccc-lib'
import _jsdom from 'jsdom'
const {JSDOM} = _jsdom

Expand All @@ -25,7 +25,7 @@ export function convertRssItemToStory(item) {

let title = item.querySelector('title')
title = title ? title.textContent : '(no title)'
title = JSDOM.fragment(title).textContent.trim()
title = parseHtml(title)

let datePublished = item.querySelector('pubDate')
datePublished = datePublished ? datePublished.textContent : null
Expand All @@ -35,12 +35,12 @@ export function convertRssItemToStory(item) {
let content = item.getAttribute('content:encoded')
content = content || (descriptionEl && descriptionEl.textContent)
content = content || '(no content)'
content = JSDOM.fragment(content).textContent.trim()
content = parseHtml(content)

let excerpt = descriptionEl
? descriptionEl.textContent
: content.substr(0, 250)
excerpt = JSDOM.fragment(excerpt).textContent.trim()
excerpt = parseHtml(excerpt)

let featuredImage = null
if (item.querySelector('enclosure')) {
Expand Down
8 changes: 3 additions & 5 deletions modules/node_modules/@frogpond/ccc-wpjson-feed/index.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import {get} from '@frogpond/ccc-lib'
import _jsdom from 'jsdom'
const {JSDOM} = _jsdom
import {get, parseHtml} from '@frogpond/ccc-lib'

export async function fetchWpJson(url, query = {}) {
const feed = await get(url, {query, json: true})
Expand Down Expand Up @@ -47,10 +45,10 @@ export function convertWpJsonItemToStory(item) {
categories: categories,
content: item.content.rendered,
datePublished: item.date_gmt,
excerpt: JSDOM.fragment(item.excerpt.rendered).textContent.trim(),
excerpt: parseHtml(item.excerpt.rendered),
featuredImage: featuredImage,
link: item.link,
title: JSDOM.fragment(item.title.rendered).textContent.trim(),
title: parseHtml(item.title.rendered),
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import {get, ONE_HOUR, makeAbsoluteUrl} from '@frogpond/ccc-lib'
import {get, ONE_HOUR, makeAbsoluteUrl, parseHtml} from '@frogpond/ccc-lib'
import {fromHtml} from '@frogpond/ccc-markdown'
import mem from 'mem'
import _jsdom from 'jsdom'
Expand All @@ -9,15 +9,15 @@ const archiveBase =
'https://apps.carleton.edu/events/convocations/feeds/media_files?page_id=342645'

function processConvo(event) {
let title = JSDOM.fragment(
let title = parseHtml(
event.querySelector('title').textContent,
).textContent.trim()
)

let description = event.querySelector('description')
description = description
? JSDOM.fragment(
? parseHtml(
event.querySelector('description').textContent,
).textContent.trim()
)
: ''

let pubDate = moment(event.querySelector('pubDate').textContent)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import {get} from '@frogpond/ccc-lib'
import {get, parseHtml} from '@frogpond/ccc-lib'
import _jsdom from 'jsdom'
import lodash from 'lodash'
const {groupBy, toPairs} = lodash
Expand All @@ -13,9 +13,9 @@ export async function noonNewsBulletein() {
let bulletinEls = [...dom.window.document.querySelectorAll('item')]
let bulletins = bulletinEls.map((item) => {
let description = item.querySelector('description').textContent
description = JSDOM.fragment(description).textContent.trim()
description = parseHtml(description)
let category = item.querySelector('category').textContent
category = JSDOM.fragment(category).textContent.trim()
category = parseHtml(category)
return {description, category}
})

Expand Down
Original file line number Diff line number Diff line change
@@ -1,22 +1,19 @@
import {get, ONE_DAY} from '@frogpond/ccc-lib'
import {get, ONE_DAY, parseHtml} from '@frogpond/ccc-lib'
import mem from 'mem'
import _jsdom from 'jsdom'
import getUrls from 'get-urls'

const {JSDOM} = _jsdom

export function cleanJob(job) {
const title = parseHtml(job.title)
const office = parseHtml(job.office)
const hoursPerWeek = parseHtml(job.hoursPerWeek)
const timeOfHours = parseHtml(job.timeOfHours)

// these all need to retain their newlines
const description = cleanTextBlock(
JSDOM.fragment(job.description).textContent,
)
const comments = cleanTextBlock(JSDOM.fragment(job.comments).textContent)
const skills = cleanTextBlock(JSDOM.fragment(job.skills).textContent)
const howToApply = cleanTextBlock(JSDOM.fragment(job.howToApply).textContent)
const timeline = cleanTextBlock(JSDOM.fragment(job.timeline).textContent)
const timeOfHours = cleanTextBlock(
JSDOM.fragment(job.timeOfHours).textContent,
)
const description = cleanTextBlock(job.description)
const comments = cleanTextBlock(job.comments)
const skills = cleanTextBlock(job.skills)
const howToApply = cleanTextBlock(job.howToApply)
const timeline = cleanTextBlock(job.timeline)

const contactEmail = fixupEmailFormat(job.contactEmail)
const contactPhone = fixupPhoneFormat(job.contactPhone)
Expand All @@ -34,12 +31,12 @@ export function cleanJob(job) {
contactPhone: contactPhone,
description: description,
goodForIncomingStudents: job.goodForIncomingStudents,
hoursPerWeek: job.hoursPerWeek,
hoursPerWeek: hoursPerWeek,
howToApply: howToApply,
id: job.id,
lastModified: job.lastModified,
links: links,
office: job.office,
office: office,
openPositions: job.openPositions,
skills: skills,
timeline: timeline,
Expand All @@ -51,7 +48,7 @@ export function cleanJob(job) {
}

function cleanTextBlock(text) {
return text.replace(/\s+/g, ' ')
return parseHtml(text).replace(/\t/g, ' ').trim()
}

export function getLinksFromJob({description, comments, skills, howToApply}) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import {get, ONE_HOUR} from '@frogpond/ccc-lib'
import {get, ONE_HOUR, parseHtml} from '@frogpond/ccc-lib'
import mem from 'mem'
import moment from 'moment-timezone'

Expand All @@ -19,9 +19,15 @@ export async function getStreams({streamClass, sort, dateFrom, dateTo}) {
(resp) => resp.body,
)
const processed = data.results.map((stream) => {
let {starttime} = stream
let {starttime, title, subtitle, performer} = stream

let streamTitle = parseHtml(title)
let detail = parseHtml(subtitle || performer || '')

return {
...stream,
title: streamTitle,
subtitle: detail,
starttime: moment
.tz(starttime, 'YYYY-MM-DD HH:mm', 'America/Chicago')
.toISOString(),
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@
"test": "./scripts/smoke-test.sh"
},
"dependencies": {
"@frogpond/titlecase": "^1.0.0",
"dotenv": "10.0.0",
"esm": "3.2.25",
"get-urls": "10.0.1",
"got": "9.6.0",
"html-entities": "2.3.2",
"is-absolute-url": "3.0.3",
"jsdom": "16.6.0",
"koa": "2.13.4",
Expand Down
10 changes: 5 additions & 5 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,11 @@
minimatch "^3.0.4"
strip-json-comments "^3.1.1"

"@frogpond/titlecase@^1.0.0":
version "1.0.0"
resolved "https://registry.yarnpkg.com/@frogpond/titlecase/-/titlecase-1.0.0.tgz#993e5371c31b58a839e76fdc531053c55b1a2736"
integrity sha512-C1qKm/J+B+cXo+7+ZHbRnt2iApx/IrxMEXwOxe+ZkeTvSC1nZ2XRWf0xzFuAanpBNDhFcgrxdnR8FMKNHo1scQ==

"@gar/promisify@^1.0.1":
version "1.1.2"
resolved "https://registry.yarnpkg.com/@gar/promisify/-/promisify-1.1.2.tgz#30aa825f11d438671d585bd44e7fd564535fc210"
Expand Down Expand Up @@ -1130,11 +1135,6 @@ html-encoding-sniffer@^2.0.1:
dependencies:
whatwg-encoding "^1.0.5"

[email protected]:
version "2.3.2"
resolved "https://registry.yarnpkg.com/html-entities/-/html-entities-2.3.2.tgz#760b404685cb1d794e4f4b744332e3b00dcfe488"
integrity sha512-c3Ab/url5ksaT0WyleslpBEthOzWhrjQbg75y7XUsfSzi3Dgzt0l8w5e7DylRn15MTlMMD58dTfzddNS2kcAjQ==

http-assert@^1.3.0:
version "1.4.1"
resolved "https://registry.yarnpkg.com/http-assert/-/http-assert-1.4.1.tgz#c5f725d677aa7e873ef736199b89686cceb37878"
Expand Down