Fix the check for misleading links

* Fix the check for misleading links

* replace tabs with spaces

* remove trailing spaces

* move isMisleadingLink to lib/url.js and create unit tests

* Add comments to test cases

* URLs can contain hyphens

---------

Co-authored-by: ekzyis <ek@stacker.news>
This commit is contained in:
Edward Kung 2025-02-14 07:43:08 -08:00 committed by GitHub
parent 77781e07ed
commit 15bd1c3fc5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 54 additions and 20 deletions

@ -1,5 +1,5 @@
import { SKIP, visit } from 'unist-util-visit'
import { parseEmbedUrl, parseInternalLinks } from './url'
import { parseEmbedUrl, parseInternalLinks, isMisleadingLink } from './url'
import { slug } from 'github-slugger'
import { toString } from 'mdast-util-to-string'
@ -255,22 +255,6 @@ export default function rehypeSN (options = {}) {
}
}
/**
 * Report whether link text that looks like a dotted hostname resolves to a
 * different origin than the link target.
 *
 * @param {string} text - visible text of the link
 * @param {string} href - URL the link points to
 * @returns {boolean} true only when the text starts like a hostname and, once
 *   prefixed with the target's protocol, parses to another origin
 */
function isMisleadingLink (text, href) {
  // text that does not start like "label.label" is never treated as a URL
  if (!/^\s*(\w+\.)+\w+/.test(text)) return false

  try {
    const target = new URL(href)
    // borrow the target's protocol so the bare hostname parses as a URL
    const claimed = new URL(target.protocol + text)
    return claimed.origin !== target.origin
  } catch {
    // either side failed to parse, so there is nothing to compare
    return false
  }
}
function replaceNostrId (value, id) {
return {
type: 'element',

@ -241,6 +241,29 @@ export function decodeProxyUrl (imgproxyUrl) {
return originalUrl
}
/**
 * Decide whether a link's visible text advertises a different origin than the
 * one the link actually points to.
 *
 * Two shapes of text are checked:
 *  - an absolute URL (e.g. "https://google.com"), compared by origin;
 *  - a bare dotted hostname (e.g. "www.google.com", "s-tacker.news"), which is
 *    prefixed with the target's protocol before the origins are compared.
 * Any other text, or an href that cannot be parsed, is not misleading.
 *
 * @param {string} text - visible text of the link
 * @param {string} href - URL the link points to
 * @returns {boolean} true when the text reads as a URL of another origin
 */
export function isMisleadingLink (text, href) {
  try {
    const hrefUrl = new URL(href)
    // Surrounding whitespace is invisible to the reader but makes
    // `protocol + text` unparsable ("https: evil.com"), which used to let
    // whitespace-padded hostnames slip through as "not misleading".
    const label = String(text).trim()

    try {
      if (new URL(label).origin !== hrefUrl.origin) return true
    } catch {
      // label is not an absolute URL; fall through to the bare-hostname check
    }

    // hostname labels may contain hyphens, hence [\w-]
    if (/^([\w-]+\.)+\w+/.test(label)) {
      return new URL(hrefUrl.protocol + label).origin !== hrefUrl.origin
    }
  } catch {
    // href (or the reconstructed label URL) is unparsable: nothing to compare
  }
  return false
}
// Case-insensitive, fully anchored pattern for a URL-shaped string. In order it
// accepts: an optional http/https/ftp scheme, an optional "www" prefix, optional
// userinfo ending in "@", either a dotted-quad IPv4 address or a dotted domain
// name, an optional ":port", then optional path, "?query" and "#fragment" parts.
// NOTE(review): the dot in `(www.)?` is unescaped, so it matches any character
// after "www" — confirm whether a literal "." was intended.
// eslint-disable-next-line
export const URL_REGEXP = /^((https?|ftp):\/\/)?(www.)?(((([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:)*@)?(((\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5]))|((([a-z]|\d|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(([a-z]|\d|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])*([a-z]|\d|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])))\.)+(([a-z]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(([a-z]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])*([a-z]|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])))\.?)(:\d*)?)(\/((([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:|@)+(\/(([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:|@)*)*)?)?(\?((([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:|@)|[\uE000-\uF8FF]|\/|\?)*)?(\#((([a-z]|\d|-|\.|_|~|[\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF])|(%[\da-f]{2})|[!\$&'\(\)\*\+,;=]|:|@)|\/|\?)*)?$/i

@ -1,8 +1,8 @@
/* eslint-env jest */
import { parseInternalLinks } from './url.js'
import { parseInternalLinks, isMisleadingLink } from './url.js'
const cases = [
const internalLinkCases = [
['https://stacker.news/items/123', '#123'],
['https://stacker.news/items/123/related', '#123/related'],
// invalid links should not be parsed so user can spot error
@ -20,7 +20,7 @@ const cases = [
]
describe('internal links', () => {
test.each(cases)(
test.each(internalLinkCases)(
'parses %p as %p',
(href, expected) => {
process.env.NEXT_PUBLIC_URL = 'https://stacker.news'
@ -29,3 +29,30 @@ describe('internal links', () => {
}
)
})
// Table of [link text, href, expected] rows; each row is passed to
// isMisleadingLink(text, href) and the result is compared with `expected`.
const misleadingLinkCases = [
// if text is the same as the link, it's not misleading
['https://stacker.news/items/1234', 'https://stacker.news/items/1234', false],
// same origin is not misleading
['https://stacker.news/items/1235', 'https://stacker.news/items/1234', false],
['www.google.com', 'https://www.google.com', false],
['stacker.news', 'https://stacker.news', false],
// if text is obviously not a link, it's not misleading
['innocent text', 'https://stacker.news/items/1234', false],
['innocenttext', 'https://stacker.news/items/1234', false],
// if text might be a link to a different origin, it's misleading
['innocent.text', 'https://stacker.news/items/1234', true],
['https://google.com', 'https://bing.com', true],
['www.google.com', 'https://bing.com', true],
['s-tacker.news', 'https://snacker.news', true]
]
// Table-driven suite: one generated test per row of misleadingLinkCases.
describe('misleading links', () => {
test.each(misleadingLinkCases)(
'identifies [%p](%p) as misleading: %p',
(text, href, expected) => {
// the verdict must equal the row's expected boolean exactly
const actual = isMisleadingLink(text, href)
expect(actual).toBe(expected)
}
)
})