rleed 72b8b5b634
Attempt to get publication year when auto-titling links ()
* port date scraper from python

* bug fixes and cleanup

* bug fixes and cleanup

* refactor

* address comments

* make it intuitive

* Update timedate-scraper.js - lint

* address review comments

* cleanup

* simplify and don't use side effects


Co-authored-by: rleed <>
2023-10-20 19:23:50 -05:00

89 lines
4.6 KiB

// Date rule for use with page-metadata-parser.
// Based on
// Usage: import ruleSet and pass it as the rule set for the field, similar to: getMetadata(doc, url, { publicationDate: ruleSet })
// Some example URLs for testing purposes:
// ld+json example from 2018:
// meta tag example from 2011:
// YouTube example from 2018:
// A news article from 2023 (< 1 year, should not display a date):
/**
 * Pull the digit-delimited portion out of a piece of text and parse it as a date.
 *
 * The portion runs from the first digit to the last digit on the same line
 * (`.` does not cross line breaks), so surrounding words such as
 * "Published 2018-05-01 today" are discarded before parsing.
 *
 * @param {string} dateString - text that may contain a date
 * @returns {Date|undefined} the parsed date, or undefined when the input is not a
 *   string, contains fewer than two digits, or the extracted text does not parse
 */
function cleanDateStr (dateString) {
  if (typeof dateString !== 'string') return undefined

  const digitSpan = dateString.match(/[0-9].*[0-9]/)
  if (!digitSpan) return undefined

  const parsed = new Date(digitSpan[0])
  // An Invalid Date object is still truthy, so it must not leak out to callers
  // that only check truthiness of the result.
  return Number.isNaN(parsed.getTime()) ? undefined : parsed
}
/**
 * Look for a date embedded in a URL (for example a `/2018/05/01/` path segment)
 * and parse it.
 *
 * @param {string} url - URL to inspect
 * @returns {Date|undefined} whatever cleanDateStr produces for the matched
 *   portion, or undefined when the input is not a string or no date-like
 *   portion is found
 */
export function extractFromURL (url) {
  // Attribute lookups yield null when the attribute is absent; treat that as "no date".
  if (typeof url !== 'string') return undefined

  // Regex by Newspaper3k
  const m = url.match(String.raw`([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?`)
  return m ? cleanDateStr(m[0]) : undefined
}
/**
 * Convert a value to a Date, rejecting anything that does not yield a valid date.
 *
 * @param {*} str - value to convert (typically an attribute value or element text)
 * @returns {Date|undefined} a valid Date, or undefined when the value is falsy,
 *   cannot be coerced, or parses to an Invalid Date
 */
function asDate (str) {
  if (!str) return undefined

  try {
    const d = new Date(str)
    return Number.isNaN(d.getTime()) ? undefined : d
  } catch {
    // The Date constructor throws for values it cannot coerce (e.g. a Symbol);
    // this is a best-effort conversion, so report "no date" instead.
    return undefined
  }
}
/**
 * Parse a node's innerHTML as JSON without throwing.
 * Page-supplied ld+json can be malformed; in that case there is simply no data.
 *
 * @param {{ innerHTML: string }} node
 * @returns {*} the parsed value, or undefined when the content is not valid JSON
 */
function parseLdJson (node) {
  try {
    return JSON.parse(node.innerHTML)
  } catch {
    return undefined
  }
}

/**
 * Read a node's `content` attribute as a date.
 *
 * @param {{ getAttribute: function(string): (string|null) }} node
 * @returns {Date|undefined}
 */
function contentDate (node) {
  return asDate(node.getAttribute('content'))
}

/**
 * Publication-date rule set: `rules` is a list of [selector, handler] pairs where
 * the handler turns a matched node into a Date (or undefined), and `scorers` is a
 * list of (element, score) functions that adjust a rule's score for an element.
 */
export const ruleSet = {
  // note meta names are case sensitive, and scorers must not favor rules when they will not return good results.
  rules: [
    ['script[type="application/ld+json"]', node => asDate(parseLdJson(node)?.datePublished)],
    ['script[type="application/ld+json"]', node => asDate(parseLdJson(node)?.dateCreated)],
    ['script[type="application/ld+json"]', node => asDate(parseLdJson(node)?.dateModified)],
    ['meta[property="article:published_time"]', contentDate],
    ['meta[name="pubdate"]', contentDate],
    ['meta[name="publishdate"]', contentDate],
    ['meta[name="timestamp"]', contentDate],
    // NOTE(review): the name value is empty — presumably the intended meta name was lost; confirm and restore.
    ['meta[name=""]', contentDate],
    ['meta[name="date"]', contentDate],
    ['meta[property="bt:pubdate"]', contentDate],
    ['meta[name="parsely-pub-date"]', contentDate],
    // NOTE(review): the name value is empty — presumably the intended meta name was lost; confirm and restore.
    ['meta[name=""]', contentDate],
    ['meta[name="article.published"]', contentDate],
    ['meta[name="published-date"]', contentDate],
    ['meta[name="article.created"]', contentDate],
    ['meta[name="article_date_original"]', contentDate],
    ['meta[name="cxenseparse:recs:publishtime"]', contentDate],
    ['meta[name="date_published"]', contentDate],
    ['meta[itemprop="datePublished"]', contentDate],
    ['meta[itemprop="datepublished"]', contentDate],
    ['meta[itemprop="datecreated"]', contentDate],
    ['meta[http-equiv="date"]', contentDate],
    // Image URLs sometimes carry the publication date in their path.
    ['meta[property="og:image"]', node => asDate(extractFromURL(node.getAttribute('content')))],
    ['meta[itemprop="image"]', node => asDate(extractFromURL(node.getAttribute('content')))],
    ['time', node => asDate(node.getAttribute('datetime') || (node.getAttribute('class') === 'timestamp' && node.innerHTML))],
    ['span[itemprop="datePublished"]', node => asDate(node.getAttribute('content') || cleanDateStr(node.innerHTML))],
    // One rule per tag/class combination; flatMap keeps every generated
    // [selector, handler] pair as its own top-level entry of `rules`.
    ...['span', 'p', 'div'].flatMap(tag =>
      ['pubdate', 'timestamp', 'article_date', 'articledate', 'date'].map(className =>
        [`${tag}[class="${className}"]`, node => asDate(cleanDateStr(node.innerHTML))]
      )
    )
  ],
  scorers: [
    // ld+json scripts: large boost when any of the three date fields is present, 0 otherwise.
    // Falls through (returns undefined) for every other element.
    (el, score) => {
      if (el.localName === 'script' && el.getAttribute('type') === 'application/ld+json' && el.innerHTML) {
        const data = parseLdJson(el)
        return data?.datePublished || data?.dateCreated || data?.dateModified ? 1000000 + score : 0
      }
    },
    // meta tags: smaller boost when the content attribute yields something from cleanDateStr.
    (el, score) => el.localName === 'meta' && el.getAttribute('content') && cleanDateStr(el.getAttribute('content')) ? 1000 + score : 0,
    // everything that is neither script nor meta keeps the score it was given.
    (el, score) => !['script', 'meta'].includes(el.localName) ? score : 0
  ]
}