const WebSocket = require('ws') // You might need to install this: npm install ws
const { nip19 } = require('nostr-tools') // Used for encoding/decoding npub and note1 identifiers
const fs = require('fs')
const path = require('path')
// ANSI color codes
const colors = {
  reset: '\x1b[0m',
  bright: '\x1b[1m',
  dim: '\x1b[2m',
  underscore: '\x1b[4m',
  blink: '\x1b[5m',
  reverse: '\x1b[7m',
  hidden: '\x1b[8m',
  fg: {
    black: '\x1b[30m',
    red: '\x1b[31m',
    green: '\x1b[32m',
    yellow: '\x1b[33m',
    blue: '\x1b[34m',
    magenta: '\x1b[35m',
    cyan: '\x1b[36m',
    white: '\x1b[37m',
    gray: '\x1b[90m',
    crimson: '\x1b[38m'
  },
  bg: {
    black: '\x1b[40m',
    red: '\x1b[41m',
    green: '\x1b[42m',
    yellow: '\x1b[43m',
    blue: '\x1b[44m',
    magenta: '\x1b[45m',
    cyan: '\x1b[46m',
    white: '\x1b[47m',
    gray: '\x1b[100m',
    crimson: '\x1b[48m'
  }
}
// Default configuration
let config = {
  userPubkeys: [],
  ignorePubkeys: [],
  timeIntervalHours: 12,
  verbosity: 'normal', // Can be 'minimal', 'normal', or 'debug'
  relayUrls: [
    'wss://relay.nostr.band',
    'wss://relay.primal.net',
    'wss://relay.damus.io'
  ],
  batchSize: 100,
  mediaPatterns: [
    {
      type: 'extensions',
      patterns: ['\\.jpg$', '\\.jpeg$', '\\.png$', '\\.gif$', '\\.bmp$', '\\.webp$', '\\.tiff$', '\\.ico$',
        '\\.mp4$', '\\.webm$', '\\.mov$', '\\.avi$', '\\.mkv$', '\\.flv$', '\\.wmv$',
        '\\.mp3$', '\\.wav$', '\\.ogg$', '\\.flac$', '\\.aac$', '\\.m4a$']
    },
    {
      type: 'domains',
      patterns: [
        'nostr\\.build\\/[ai]\\/\\w+',
        'i\\.imgur\\.com\\/\\w+',
        'i\\.ibb\\.co\\/\\w+\\/',
        'tenor\\.com\\/view\\/',
        'giphy\\.com\\/gifs\\/',
        'soundcloud\\.com\\/',
        'spotify\\.com\\/',
        'fountain\\.fm\\/'
      ]
    }
  ]
}
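
// A config file can override any of the defaults above (see loadConfig and
// main below). A minimal sketch of nostr-link-extract.config.json -- the npub
// values are placeholders, not real keys:
//
// {
//   "userPubkeys": ["npub1yourkeyhere..."],
//   "ignorePubkeys": ["npub1mutedkeyhere..."],
//   "timeIntervalHours": 24,
//   "verbosity": "debug",
//   "relayUrls": ["wss://relay.damus.io"],
//   "batchSize": 50
// }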
/**
 * Logger utility that respects the configured verbosity level
 */
const logger = {
  // Always show error messages
  error: (message) => {
    console.error(`${colors.fg.red}Error: ${message}${colors.reset}`)
  },
  // Minimal essential info - always show regardless of verbosity
  info: (message) => {
    console.log(`${colors.fg.green}${message}${colors.reset}`)
  },
  // Progress updates - show in normal and debug modes
  progress: (message) => {
    if (config.verbosity !== 'minimal') {
      console.log(`${colors.fg.blue}${message}${colors.reset}`)
    }
  },
  // Detailed debug info - only show in debug mode
  debug: (message) => {
    if (config.verbosity === 'debug') {
      console.log(`${colors.fg.gray}${message}${colors.reset}`)
    }
  },
  // Results info - formatted differently for clarity
  result: (message) => {
    console.log(`${colors.bright}${colors.fg.green}${message}${colors.reset}`)
  }
}
/**
 * Load configuration from a JSON file
 * @param {String} configPath - Path to the config file
 * @returns {Object} - Configuration object
 */
function loadConfig (configPath) {
  try {
    const configData = fs.readFileSync(configPath, 'utf8')
    const loadedConfig = JSON.parse(configData)
    // Merge with the default config so all properties exist
    return { ...config, ...loadedConfig }
  } catch (error) {
    logger.error(`Error loading config file: ${error.message}`)
    logger.info('Using default configuration')
    return config
  }
}
/**
 * Checks if a URL is a media file or hosted on a media platform based on configured patterns
 * @param {String} url - URL to check
 * @returns {Boolean} - true if it's likely a media URL
 */
function isMediaUrl (url) {
  // Check for media patterns from config
  if (config.mediaPatterns) {
    for (const patternGroup of config.mediaPatterns) {
      for (const pattern of patternGroup.patterns) {
        const regex = new RegExp(pattern, 'i')
        if (regex.test(url)) return true
      }
    }
  }
  return false
}
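
// For example, with the default patterns above:
//   isMediaUrl('https://i.imgur.com/abc123.jpg') // true (extension and domain both match)
//   isMediaUrl('https://example.com/article')    // false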
/**
 * Fetches events from Nostr relays using WebSockets
 * @param {Array} relayUrls - Array of relay URLs
 * @param {Object} filter - Nostr filter object
 * @param {Number} timeoutMs - Timeout in milliseconds
 * @returns {Promise<Array>} - Array of events matching the filter
 */
async function fetchEvents (relayUrls, filter, timeoutMs = 10000) {
  logger.debug(`Fetching events with filter: ${JSON.stringify(filter)}`)

  const events = []
  for (const url of relayUrls) {
    try {
      const ws = new WebSocket(url)
      const relayEvents = await new Promise((resolve, reject) => {
        const timeout = setTimeout(() => {
          ws.close()
          resolve([]) // Resolve with empty array on timeout
        }, timeoutMs)
        const localEvents = []
        ws.on('open', () => {
          // Create a unique request ID
          const requestId = `req${Math.floor(Math.random() * 10000)}`
          // Format and send the request
          const request = JSON.stringify(['REQ', requestId, filter])
          ws.send(request)
          ws.on('message', (data) => {
            try {
              const message = JSON.parse(data.toString())
              // Check if it's an EVENT message
              if (message[0] === 'EVENT' && message[2]) {
                localEvents.push(message[2])
              } else if (message[0] === 'EOSE') {
                clearTimeout(timeout)
                ws.close()
                resolve(localEvents)
              }
            } catch (error) {
              logger.debug(`Error parsing message: ${error.message}`)
            }
          })
        })
        ws.on('error', (error) => {
          logger.debug(`WebSocket error for ${url}: ${error.message}`)
          clearTimeout(timeout)
          resolve([]) // Resolve with empty array on error
        })
        ws.on('close', () => {
          clearTimeout(timeout)
          resolve(localEvents)
        })
      })
      logger.debug(`Got ${relayEvents.length} events from ${url}`)
      events.push(...relayEvents)
    } catch (error) {
      logger.debug(`Error connecting to ${url}: ${error.message}`)
    }
  }
  // Remove duplicates based on event ID
  const uniqueEvents = {}
  events.forEach(event => {
    if (!uniqueEvents[event.id]) {
      uniqueEvents[event.id] = event
    }
  })
  return Object.values(uniqueEvents)
}
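
// Example usage (a sketch; the author value is a placeholder hex pubkey):
//   const notes = await fetchEvents(
//     ['wss://relay.damus.io'],
//     { kinds: [1], authors: ['<64-char hex pubkey>'], since: Math.floor(Date.now() / 1000) - 3600 },
//     15000
//   )
// Under NIP-01, each relay answers the REQ with zero or more EVENT messages
// followed by EOSE ("end of stored events"), which is what the message
// handler above waits for before closing the socket.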
/**
 * Get Nostr notes from followings of specified users that contain external links
 * and were posted within the specified time interval.
 *
 * @param {Array} userPubkeys - Array of Nostr user public keys
 * @param {Number} timeIntervalHours - Number of hours to look back from now
 * @param {Array} relayUrls - Array of Nostr relay URLs
 * @param {Array} ignorePubkeys - Array of pubkeys to ignore (optional)
 * @returns {Promise<Array>} - Array of note objects containing external links within the time interval
 */
async function getNotesWithLinks (userPubkeys, timeIntervalHours, relayUrls, ignorePubkeys = []) {
  // Calculate the cutoff time in seconds (Nostr uses UNIX timestamps)
  const now = Math.floor(Date.now() / 1000)
  const cutoffTime = now - (timeIntervalHours * 60 * 60)
  const allNotesWithLinks = []
  const allFollowedPubkeys = new Set() // To collect all followed pubkeys
  const ignoreSet = new Set(ignorePubkeys) // Convert the ignore list to a Set for efficient lookups
  if (ignoreSet.size > 0) {
    logger.debug(`Ignoring ${ignoreSet.size} author(s) as requested`)
  }

  logger.info(`Fetching follow lists for ${userPubkeys.length} users...`)

  // First get the followings for each user
  for (const pubkey of userPubkeys) {
    try {
      // Skip if this pubkey is in the ignore list
      if (ignoreSet.has(pubkey)) {
        logger.debug(`Skipping user ${pubkey} as it's in the ignore list`)
        continue
      }
      logger.debug(`Fetching follow list for ${pubkey} from ${relayUrls.length} relays...`)

      // Get the most recent contact list (kind 3)
      const followListEvents = await fetchEvents(relayUrls, {
        kinds: [3],
        authors: [pubkey]
      })
      if (followListEvents.length === 0) {
        logger.debug(`No follow list found for user ${pubkey}. Verify this pubkey has contacts on these relays.`)
        continue
      }
      // Find the most recent follow list event
      const latestFollowList = followListEvents.reduce((latest, event) =>
        !latest || event.created_at > latest.created_at ? event : latest, null)
      if (!latestFollowList) {
        logger.debug(`No valid follow list found for user ${pubkey}`)
        continue
      }
      logger.debug(`Found follow list created at: ${new Date(latestFollowList.created_at * 1000).toISOString()}`)

      // Check that the tags property exists
      if (!latestFollowList.tags) {
        logger.debug(`No tags found in follow list for user ${pubkey}`)
        logger.debug(`Follow list data: ${JSON.stringify(latestFollowList, null, 2)}`)
        continue
      }
      // Extract followed pubkeys from the follow list (tag type 'p')
      const followedPubkeys = latestFollowList.tags
        .filter(tag => tag[0] === 'p')
        .map(tag => tag[1])
        .filter(pk => !ignoreSet.has(pk)) // Filter out pubkeys from the ignore list

      if (!followedPubkeys || followedPubkeys.length === 0) {
        logger.debug(`No followed users found for user ${pubkey} (after filtering ignore list)`)
        continue
      }
      // Add all followed pubkeys to our set
      followedPubkeys.forEach(pk => allFollowedPubkeys.add(pk))
      logger.debug(`Added ${followedPubkeys.length} followed users for ${pubkey} (total: ${allFollowedPubkeys.size})`)
    } catch (error) {
      logger.error(`Error processing user ${pubkey}: ${error}`)
    }
  }
  // If we found any followed pubkeys, fetch their notes in batches
  if (allFollowedPubkeys.size > 0) {
    // Convert the Set to an Array for the filter
    const followedPubkeysArray = Array.from(allFollowedPubkeys)
    const batchSize = config.batchSize || 100 // Use config batch size or default to 100
    const totalBatches = Math.ceil(followedPubkeysArray.length / batchSize)

    logger.progress(`Processing ${followedPubkeysArray.length} followed users in ${totalBatches} batches...`)

    // Process in batches
    for (let batchNum = 0; batchNum < totalBatches; batchNum++) {
      const start = batchNum * batchSize
      const end = Math.min(start + batchSize, followedPubkeysArray.length)
      const batch = followedPubkeysArray.slice(start, end)

      logger.progress(`Fetching batch ${batchNum + 1}/${totalBatches} (${batch.length} authors)...`)

      // Fetch notes from the current batch of users
      const notes = await fetchEvents(relayUrls, {
        kinds: [1],
        authors: batch,
        since: cutoffTime
      }, 30000) // Use a longer timeout for this larger query

      logger.debug(`Retrieved ${notes.length} notes from batch ${batchNum + 1}`)

      // Filter notes that have URLs (excluding notes with only media URLs)
      const notesWithUrls = notes.filter(note => {
        // Extract all URLs from the content
        const urlRegex = /(https?:\/\/[^\s]+)/g
        const matches = note.content.match(urlRegex) || []

        if (matches.length === 0) return false // No URLs at all

        // Check if any URL is not a media file
        const hasNonMediaUrl = matches.some(url => !isMediaUrl(url))

        return hasNonMediaUrl
      })
      logger.debug(`Found ${notesWithUrls.length} notes containing non-media URLs in batch ${batchNum + 1}`)

      // Get all unique authors from the filtered notes in this batch
      const authorsWithUrls = new Set(notesWithUrls.map(note => note.pubkey))
      // Fetch metadata for all relevant authors in this batch
      if (authorsWithUrls.size > 0) {
        logger.debug(`Fetching metadata for ${authorsWithUrls.size} authors from batch ${batchNum + 1}...`)
        const allMetadata = await fetchEvents(relayUrls, {
          kinds: [0],
          authors: Array.from(authorsWithUrls)
        })
        // Create a map of author pubkey to their latest metadata
        const metadataByAuthor = {}
        allMetadata.forEach(meta => {
          if (!metadataByAuthor[meta.pubkey] || meta.created_at > metadataByAuthor[meta.pubkey].created_at) {
            metadataByAuthor[meta.pubkey] = meta
          }
        })
        // Attach metadata to notes
        for (const note of notesWithUrls) {
          if (metadataByAuthor[note.pubkey]) {
            try {
              const metadata = JSON.parse(metadataByAuthor[note.pubkey].content)
              note.userMetadata = metadata
            } catch (e) {
              logger.debug(`Error parsing metadata for ${note.pubkey}: ${e.message}`)
            }
          }
        }
      }
      // Add all notes with URLs from this batch to our results
      allNotesWithLinks.push(...notesWithUrls)
      // Show incremental progress during batch processing
      if (allNotesWithLinks.length > 0 && batchNum < totalBatches - 1) {
        logger.progress(`Found ${allNotesWithLinks.length} notes with links so far...`)
      }
    }
    logger.progress(`Completed processing all ${totalBatches} batches`)
  }
  return allNotesWithLinks
}
/**
 * Format the notes for display with colorful output
 *
 * @param {Array} notes - Array of note objects
 * @returns {String} - Formatted string with note information
 */
function formatNoteOutput (notes) {
  const output = []
  for (const note of notes) {
    // Encode the note ID (note1...) and the author pubkey (npub1...) for display
    const noteId = nip19.noteEncode(note.id)
    const pubkey = nip19.npubEncode(note.pubkey)
    // Get the user's display name, or fall back to the npub
    const userName = note.userMetadata
      ? (note.userMetadata.display_name || note.userMetadata.name || pubkey)
      : pubkey
    // Get the timestamp as a readable date
    const timestamp = new Date(note.created_at * 1000).toISOString()
    // Extract URLs from the content, marking media URLs with colors
    const urlRegex = /(https?:\/\/[^\s]+)/g
    const matches = note.content.match(urlRegex) || []
    // Format URLs with colors
    const markedUrls = matches.map(url => {
      const isMedia = isMediaUrl(url)
      if (isMedia) {
        return `${colors.fg.gray}${url}${colors.reset} (media)`
      } else {
        return `${colors.bright}${colors.fg.cyan}${url}${colors.reset}`
      }
    })
    // Format output with colors
    output.push(`${colors.bright}${colors.fg.yellow}Note by ${colors.fg.magenta}${userName}${colors.fg.yellow} at ${timestamp}${colors.reset}`)
    output.push(`${colors.fg.green}Note ID: ${colors.reset}${noteId}`)
    output.push(`${colors.fg.green}Pubkey: ${colors.reset}${pubkey}`)
    // Add links with a heading
    output.push(`${colors.bright}${colors.fg.blue}External URLs:${colors.reset}`)
    markedUrls.forEach(url => {
      output.push(`${url}`)
    })
    // Add content with a heading
    output.push(`${colors.bright}${colors.fg.blue}Note content:${colors.reset}`)
    // Colorize any links in the content when displaying
    let coloredContent = note.content
    for (const url of matches) {
      const isMedia = isMediaUrl(url)
      const colorCode = isMedia ? colors.fg.gray : colors.bright + colors.fg.cyan
      coloredContent = coloredContent.replace(
        new RegExp(escapeRegExp(url), 'g'),
        `${colorCode}${url}${colors.reset}`
      )
    }
    output.push(coloredContent)
    output.push(`${colors.fg.yellow}${'-'.repeat(50)}${colors.reset}`)
  }
  return output.join('\n')
}
/**
 * Escape special characters for use in a regular expression
 * @param {String} string - String to escape
 * @returns {String} - Escaped string
 */
function escapeRegExp (string) {
  return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
}
/**
 * Convert a pubkey from npub to hex format if needed
 * @param {String} key - Pubkey in either npub or hex format
 * @returns {String} - Pubkey in hex format
 */
function normalizeToHexPubkey (key) {
  // If it's an npub, decode it
  if (typeof key === 'string' && key.startsWith('npub1')) {
    try {
      const { type, data } = nip19.decode(key)
      if (type === 'npub') {
        return data
      }
    } catch (e) {
      logger.error(`Error decoding npub ${key}: ${e.message}`)
    }
  }
  // Otherwise assume it's already in hex format
  return key
}
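
// For example (values abridged):
//   normalizeToHexPubkey('npub1...')  // -> 64-character hex pubkey
//   normalizeToHexPubkey('3bf0c6...') // -> returned unchanged (already hex)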
/**
 * Main function to execute the script
 */
async function main () {
  // Load configuration from file
  const configPath = path.join(__dirname, 'nostr-link-extract.config.json')
  logger.info(`Loading configuration from ${configPath}`)
  config = loadConfig(configPath)

  try {
    logger.info(`Starting Nostr link extraction (time interval: ${config.timeIntervalHours} hours)`)
    // Convert any npub-format keys to hex
    const hexUserPubkeys = config.userPubkeys.map(normalizeToHexPubkey)
    const hexIgnorePubkeys = config.ignorePubkeys.map(normalizeToHexPubkey)
    // Log the conversion for clarity (helpful for debugging)
    if (config.userPubkeys.some(key => key.startsWith('npub1'))) {
      logger.debug('Converted user npubs to hex format for the Nostr protocol')
    }
    if (config.ignorePubkeys.some(key => key.startsWith('npub1'))) {
      logger.debug('Converted ignore list npubs to hex format for the Nostr protocol')
    }
    const notesWithLinks = await getNotesWithLinks(
      hexUserPubkeys,
      config.timeIntervalHours,
      config.relayUrls,
      hexIgnorePubkeys
    )

    if (notesWithLinks.length > 0) {
      const formattedOutput = formatNoteOutput(notesWithLinks)
      console.log(formattedOutput)
      logger.result(`Total notes with links: ${notesWithLinks.length}`)
    } else {
      logger.info('No notes with links found in the specified time interval.')
    }
  } catch (error) {
    logger.error(`${error}`)
  }
}
// Execute the script
main()
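
// To run (assumes `ws` and `nostr-tools` are installed, and that the config
// file, if any, sits next to this script):
//   node scripts/nostr-link-extract.js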