From 72b8b5b63465ea649b4762e06ce530b2a0ba2d4b Mon Sep 17 00:00:00 2001
From: rleed <101502594+rleed@users.noreply.github.com>
Date: Fri, 20 Oct 2023 21:23:50 -0300
Subject: [PATCH] Attempt to get publication year when auto-titling links (#520)

* port date scraper from python
* bug fixes and cleanup
* bug fixes and cleanup
* refactor
* address comments
* make it intuitive
* Update timedate-scraper.js - lint
* address review comments
* cleanup
* simplify and don't use side effects

---------

Co-authored-by: rleed
---
 api/resolvers/item.js   |  8 +++-
 lib/timedate-scraper.js | 88 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 1 deletion(-)
 create mode 100644 lib/timedate-scraper.js

diff --git a/api/resolvers/item.js b/api/resolvers/item.js
index 249f4931..6648dbfd 100644
--- a/api/resolvers/item.js
+++ b/api/resolvers/item.js
@@ -3,6 +3,7 @@ import { ensureProtocol, removeTracking } from '../../lib/url'
 import { serializeInvoicable } from './serial'
 import { decodeCursor, LIMIT, nextCursorEncoded } from '../../lib/cursor'
 import { getMetadata, metadataRuleSets } from 'page-metadata-parser'
+import { ruleSet as publicationDateRuleSet } from '../../lib/timedate-scraper'
 import domino from 'domino'
 import {
   ITEM_SPAM_INTERVAL, ITEM_FILTER_THRESHOLD,
@@ -17,6 +18,7 @@ import { advSchema, amountSchema, bountySchema, commentSchema, discussionSchema,
 import { sendUserNotification } from '../webPush'
 import { defaultCommentSort, isJob } from '../../lib/item'
 import { notifyItemParents, notifyUserSubscribers, notifyZapped } from '../../lib/push-notifications'
+import { datePivot } from '../../lib/time'
 
 export async function commentFilterClause (me, models) {
   let clause = ` AND ("Item"."weightedVotes" - "Item"."weightedDownVotes" > -${ITEM_FILTER_THRESHOLD}`
@@ -540,8 +542,12 @@ export default {
         const response = await fetch(ensureProtocol(url), { redirect: 'follow' })
         const html = await response.text()
         const doc = domino.createWindow(html).document
-        const metadata = getMetadata(doc, url, { title: metadataRuleSets.title })
+        const metadata = getMetadata(doc, url, { title: metadataRuleSets.title, publicationDate: publicationDateRuleSet })
+        const dateHint = ` (${metadata.publicationDate?.getFullYear()})`
+        const moreThanOneYearAgo = metadata.publicationDate && metadata.publicationDate < datePivot(new Date(), { years: -1 })
+
         res.title = metadata?.title
+        if (moreThanOneYearAgo) res.title += dateHint
       } catch { }
 
       try {
diff --git a/lib/timedate-scraper.js b/lib/timedate-scraper.js
new file mode 100644
index 00000000..0774b05f
--- /dev/null
+++ b/lib/timedate-scraper.js
@@ -0,0 +1,88 @@
// Date rule for use with page-metadata-parser.
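// Roughly how page-metadata-parser applies it: each [selector, handler] pair in rules is queried against
// the document, every match gets a score (the scorers at the bottom boost machine-readable sources), and
// the handler result of the highest-scoring match is returned, here as a Date object rather than a string.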
// Based on https://github.com/Webhose/article-date-extractor/blob/master/articleDateExtractor/__init__.py
// Usage: import ruleSet and use in a call similar to: getMetadata(doc, url, { publicationDate: ruleSet })
// Some example URLs for testing purposes:

// ld+json example from 2018:
// https://mhagemann.medium.com/how-to-add-structured-json-ld-data-to-nuxt-js-8bb5f7c8a2d

// meta tag example from 2011:
// https://www.newyorker.com/magazine/2011/10/10/the-crypto-currency

// YouTube example from 2018:
// https://www.youtube.com/watch?v=YSUVRj8iznU

// A news article from 2023 (< 1 year, should not display a date):
// https://edition.cnn.com/politics/live-news/matt-gaetz-kevin-mccarthy-house-speakership-10-03-23/index.html

function cleanDateStr (dateString) {
  try {
    return new Date(dateString.match(String.raw`[0-9].*[0-9]`)[0])
  } catch {}
}

export function extractFromURL (url) {
  // Regex by Newspaper3k - https://github.com/codelucas/newspaper/blob/master/newspaper/urls.py
  const m = url.match(String.raw`([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?`)
  if (m) {
    return cleanDateStr(m[0])
  }
}

function asDate (str) {
  if (str) {
    try {
      const d = new Date(str)
      if (!isNaN(d)) return d
    } catch { }
  }
}

export const ruleSet = {
  // note meta names are case sensitive, and scorers must not favor rules when they will not return good results.
  rules: [
    ['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.datePublished)],
    ['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.dateCreated)],
    ['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.dateModified)],

    ['meta[property="article:published_time"]', node => asDate(node.getAttribute('content'))],
    ['meta[name="pubdate"]', node => asDate(node.getAttribute('content'))],
    ['meta[name="publishdate"]', node => asDate(node.getAttribute('content'))],
    ['meta[name="timestamp"]', node => asDate(node.getAttribute('content'))],
    ['meta[name="dc.date.issued"]', node => asDate(node.getAttribute('content'))],
    ['meta[name="date"]', node => asDate(node.getAttribute('content'))],
    ['meta[property="bt:pubdate"]', node => asDate(node.getAttribute('content'))],
    ['meta[name="parsely-pub-date"]', node => asDate(node.getAttribute('content'))],
    ['meta[name="sailthru.date"]', node => asDate(node.getAttribute('content'))],
    ['meta[name="article.published"]', node => asDate(node.getAttribute('content'))],
    ['meta[name="published-date"]', node => asDate(node.getAttribute('content'))],
    ['meta[name="article.created"]', node => asDate(node.getAttribute('content'))],
    ['meta[name="article_date_original"]', node => asDate(node.getAttribute('content'))],
    ['meta[name="cxenseparse:recs:publishtime"]', node => asDate(node.getAttribute('content'))],
    ['meta[name="date_published"]', node => asDate(node.getAttribute('content'))],
    ['meta[itemprop="datePublished"]', node => asDate(node.getAttribute('content'))],
    ['meta[itemprop="datepublished"]', node => asDate(node.getAttribute('content'))],
    ['meta[itemprop="datecreated"]', node => asDate(node.getAttribute('content'))],
    ['meta[http-equiv="date"]', node => asDate(node.getAttribute('content'))],
    ['meta[property="og:image"]', node => asDate(extractFromURL(node.getAttribute('content')))],
    ['meta[itemprop="image"]', node => asDate(extractFromURL(node.getAttribute('content')))],

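    // Lower-priority fallbacks: <time> elements and spans/paragraphs/divs with date-like
    // class names, parsed from whatever attribute value or text they carry.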
    ['time', node => asDate(node.getAttribute('datetime') || (node.getAttribute('class') === 'timestamp' && node.innerHTML))],
    ['span[itemprop="datePublished"]', node => asDate(node.getAttribute('content') || cleanDateStr(node.innerHTML))],
    ...['span', 'p', 'div'].map(tag => {
      return ['pubdate', 'timestamp', 'article_date', 'articledate', 'date'].map(className => {
        return [`${tag}[class="${className}"]`, node => asDate(cleanDateStr(node.innerHTML))]
      })
    }).flat()
  ],
  scorers: [
    // ld+json blocks that actually carry a date field win outright
    (el, score) => {
      if (el.localName === 'script' && el.getAttribute('type') === 'application/ld+json' && el.innerHTML) {
        const data = JSON.parse(el.innerHTML)
        return data?.datePublished || data?.dateCreated || data?.dateModified ? 1000000 + score : 0
      }
    },
    // next, meta tags whose content looks like it contains a date
    (el, score) => el.localName === 'meta' && el.getAttribute('content') && cleanDateStr(el.getAttribute('content')) ? 1000 + score : 0,
    // everything else keeps its position-based score
    (el, score) => !['script', 'meta'].includes(el.localName) ? score : 0
  ]
}
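For reviewers who want to exercise the rule set outside the resolver, here is a minimal standalone sketch. The HTML string and URL are made up for illustration, and the import path assumes the script sits at the repo root; the key name passed to getMetadata ('publicationDate') is what determines the field name on the returned metadata object.

// illustrative sketch only: run lib/timedate-scraper.js against a hand-written document
import domino from 'domino'
import { getMetadata } from 'page-metadata-parser'
import { ruleSet as publicationDateRuleSet } from './lib/timedate-scraper'

// made-up page with a single machine-readable date
const html = `<html><head>
  <meta property="article:published_time" content="2018-03-14T09:00:00Z">
</head><body></body></html>`

const doc = domino.createWindow(html).document
const metadata = getMetadata(doc, 'https://example.com/post', { publicationDate: publicationDateRuleSet })

// the rule handlers return a Date (or undefined when nothing matched), not a string
console.log(metadata.publicationDate?.getFullYear()) // 2018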