Merge pull request #781 from mzivil/fix-hn-and-bitcointalk-dupes

Fix hacker news and bitcointalk dupes
This commit is contained in:
Keyan 2024-02-02 15:16:15 -06:00 committed by GitHub
commit cb5c12b82d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 17 additions and 12 deletions

View File

@ -1,5 +1,5 @@
import { GraphQLError } from 'graphql'
import { ensureProtocol, removeTracking } from '../../lib/url'
import { ensureProtocol, removeTracking, stripTrailingSlash } from '../../lib/url'
import serialize, { serializeInvoicable } from './serial'
import { decodeCursor, LIMIT, nextCursorEncoded } from '../../lib/cursor'
import { getMetadata, metadataRuleSets } from 'page-metadata-parser'
@ -540,29 +540,30 @@ export default {
},
dupes: async (parent, { url }, { me, models }) => {
const urlObj = new URL(ensureProtocol(url))
let { hostname, pathname } = urlObj
const { hostname, pathname } = urlObj
hostname = hostname + '(:[0-9]+)?'
let hostnameRegex = hostname + '(:[0-9]+)?'
const parseResult = parse(urlObj.hostname)
if (parseResult?.subdomain?.length) {
const { subdomain } = parseResult
hostname = hostname.replace(subdomain, '(%)?')
hostnameRegex = hostnameRegex.replace(subdomain, '(%)?')
} else {
hostname = `(%.)?${hostname}`
hostnameRegex = `(%.)?${hostnameRegex}`
}
// escape postgres regex meta characters
pathname = pathname.replace(/\+/g, '\\+')
pathname = pathname.replace(/%/g, '\\%')
pathname = pathname.replace(/_/g, '\\_')
let pathnameRegex = pathname.replace(/\+/g, '\\+')
pathnameRegex = pathnameRegex.replace(/%/g, '\\%')
pathnameRegex = pathnameRegex.replace(/_/g, '\\_')
let uri = hostname + pathname
uri = uri.endsWith('/') ? uri.slice(0, -1) : uri
const uriRegex = stripTrailingSlash(hostnameRegex + pathnameRegex)
let similar = `(http(s)?://)?${uri}/?`
let similar = `(http(s)?://)?${uriRegex}/?`
const whitelist = ['news.ycombinator.com/item', 'bitcointalk.org/index.php']
const youtube = ['www.youtube.com', 'youtube.com', 'm.youtube.com', 'youtu.be']
if (whitelist.includes(uri)) {
const hostAndPath = stripTrailingSlash(urlObj.hostname + urlObj.pathname)
if (whitelist.includes(hostAndPath)) {
similar += `\\${urlObj.search}`
} else if (youtube.includes(urlObj.hostname)) {
// extract id and create both links

View File

@ -23,6 +23,10 @@ export function removeTracking (value) {
return value
}
export function stripTrailingSlash (uri) {
return uri.endsWith('/') ? uri.slice(0, -1) : uri
}
// eslint-disable-next-line
export const URL_REGEXP = /^((https?|ftp):\/\/)?(www.)?(((([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:)*@)?(((\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5]))|((([a-z]|\d|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(([a-z]|\d|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])*([a-z]|\d|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])))\.)+(([a-z]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(([a-z]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])*([a-z]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])))\.?)(:\d*)?)(\/((([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:|@)+(\/(([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:|@)*)*)?)?(\?((([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:|@)|[\uE000-\uF8FF]|\/|\?)*)?(\#((([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:|@)|\/|\?)*)?$/i