-
-
Notifications
You must be signed in to change notification settings - Fork 275
/
Copy pathextract-collective-spam-domains.js
59 lines (46 loc) · 1.66 KB
/
extract-collective-spam-domains.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import '../server/env';
import getUrls from 'get-urls';
import { union } from 'lodash';
import moment from 'moment';
import { NON_SPAMMERS_DOMAINS, resolveRedirect, SPAMMERS_DOMAINS } from '../server/lib/spam';
import models, { Op, sequelize } from '../server/models';
// Tally of hostname -> number of occurrences, filled in by run().
// Created without a prototype so that a hostname matching an inherited
// Object.prototype member (e.g. "constructor") starts out as undefined
// instead of resolving to that member and corrupting its count.
const domains = Object.create(null);
/**
 * Comparator for `[key, count]` entries that orders them by count, highest first.
 *
 * Returns 0 when both counts are equal so the comparator is consistent
 * (`compare(a, b)` and `compare(b, a)` never both claim "a comes first"),
 * which keeps the order of ties well-defined when passed to `Array.prototype.sort`.
 *
 * @param {[string, number]} entryA - first entry; only its count is used
 * @param {[string, number]} entryB - second entry; only its count is used
 * @returns {number} negative if A has the higher count, positive if B does, 0 on a tie
 */
const compareEntries = ([, countA], [, countB]) => countB - countA;
/**
 * Prints an updated SPAMMERS_DOMAINS list to the console.
 *
 * Loads collectives that have no `approvedAt`, have a `longDescription` and were
 * updated within the last 3 months, keeps only those whose `data.isBanned` or
 * `data.seo` is exactly `true`, extracts every URL found in their text fields and
 * counts the hostnames in the module-level `domains` tally. The 100 most frequent
 * hostnames are merged with the existing SPAMMERS_DOMAINS and logged as sorted JSON.
 *
 * The database connection is closed even if any step throws.
 *
 * @returns {Promise<void>}
 */
async function run() {
  try {
    const collectives = await models.Collective.findAll({
      where: {
        approvedAt: { [Op.is]: null },
        longDescription: { [Op.not]: null },
        updatedAt: { [Op.gte]: moment().subtract(3, 'month').toDate() },
      },
      order: [['updatedAt', 'DESC']],
      // Sequelize option: also return soft-deleted rows.
      paranoid: false,
    });

    for (const collective of collectives) {
      // Only collectives explicitly flagged as banned or SEO contribute domains.
      if (collective.data?.isBanned !== true && collective.data?.seo !== true) {
        continue;
      }

      const content = `${collective.slug} ${collective.name} ${collective.description} ${collective.longDescription} ${collective.website}`;
      const urls = getUrls(content);

      for (const url of urls) {
        // resolveRedirect comes from ../server/lib/spam; the hostname of the URL
        // it returns is what gets counted.
        const { hostname } = resolveRedirect(new URL(url));
        if (NON_SPAMMERS_DOMAINS.includes(hostname)) {
          continue;
        }
        if (domains[hostname]) {
          domains[hostname]++;
        } else {
          domains[hostname] = 1;
        }
      }
    }

    const entries = Object.entries(domains);
    entries.sort(compareEntries);
    const topDomains = entries.slice(0, 100).map(([hostname]) => hostname);

    console.log('Updated SPAMMERS_DOMAINS = ', JSON.stringify(union(SPAMMERS_DOMAINS, topDomains).sort(), null, 2));
  } finally {
    // Always release the connection, including when the query or URL parsing fails.
    await sequelize.close();
  }
}
// Script entry point: report any failure and exit non-zero instead of leaving
// the promise returned by run() unhandled.
run().catch(error => {
  console.error(error);
  process.exit(1);
});