use capture microservice

keyan 2023-07-15 11:17:16 -05:00
parent d1ed72bb85
commit 61c64646b5
12 changed files with 2741 additions and 13975 deletions

.gitignore (5 lines changed)

@@ -1,7 +1,7 @@
 # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.

 # dependencies
-/node_modules
+node_modules/
 /.pnp
 .pnp.js
 .cache
@@ -42,9 +42,6 @@ envbak
 !.elasticbeanstalk/*.cfg.yml
 !.elasticbeanstalk/*.global.yml

-# copilot
-copilot/
-
 # service worker
 public/sw.js*
 sw/precache-manifest.json

capture/.dockerignore (new file, 1 line)

@@ -0,0 +1 @@
node_modules

capture/Dockerfile (new file, 15 lines)

@@ -0,0 +1,15 @@
FROM ghcr.io/puppeteer/puppeteer:18.2.1
EXPOSE 5678
# install dependencies as root, then drop privileges at the end
USER root
WORKDIR /home/pptruser
# the base image bundles Chrome, so skip Puppeteer's own browser download
ENV PUPPETEER_SKIP_DOWNLOAD=true
COPY ./package*.json ./
RUN npm ci
COPY . .
CMD [ "node", "index.js" ]
# the service runs as the unprivileged pptruser
USER pptruser
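
To try the image locally, a build-and-run sketch (the sn-capture tag and the port mapping are illustrative choices, not part of this commit; on Linux, reaching host.docker.internal additionally needs --add-host=host.docker.internal:host-gateway):

docker build -t sn-capture ./capture
docker run --rm -p 5678:5678 sn-capture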

capture/index.js (new file, 60 lines)

@@ -0,0 +1,60 @@
import express from 'express'
import puppeteer from 'puppeteer'

const captureUrl = process.env.CAPTURE_URL || 'http://host.docker.internal:3000/'
const port = process.env.PORT || 5678
// numeric settings arrive from the environment as strings, so coerce them
const maxPages = Number(process.env.MAX_PAGES) || 5
const timeout = Number(process.env.TIMEOUT) || 10000
const cache = Number(process.env.CACHE) || 60000
const width = Number(process.env.WIDTH) || 600
const height = Number(process.env.HEIGHT) || 315
const deviceScaleFactor = Number(process.env.SCALE_FACTOR) || 2

let browser
const app = express()

app.get('/health', (req, res) => {
  res.status(200).end()
})

app.get('/*', async (req, res) => {
  // lazily launch a single shared browser on the first request
  browser ||= await puppeteer.launch({
    headless: 'new',
    executablePath: 'google-chrome-stable',
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  })

  const url = new URL(req.originalUrl, captureUrl)
  console.time(url.href)
  console.timeLog(url.href, 'capturing', 'current pages', (await browser.pages()).length)

  // limit the number of active pages; the +1 allows for the browser's initial blank tab
  if ((await browser.pages()).length > maxPages + 1) {
    console.timeLog(url.href, 'too many pages')
    console.timeEnd(url.href)
    return res.writeHead(503, {
      'Retry-After': 1
    }).end()
  }

  let page
  try {
    page = await browser.newPage()
    await page.setViewport({ width, height, deviceScaleFactor })
    await page.emulateMediaFeatures([{ name: 'prefers-color-scheme', value: 'dark' }])
    await page.goto(url.href, { waitUntil: 'load', timeout })
    const file = await page.screenshot({ type: 'png', captureBeyondViewport: false })
    res.setHeader('Content-Type', 'image/png')
    res.setHeader('Cache-Control', `public, max-age=${cache}, immutable`)
    res.status(200).end(file)
  } catch (err) {
    console.log(err)
    return res.status(500).end()
  } finally {
    console.timeEnd(url.href)
    await page?.close()
  }
})

app.listen(port, () =>
  console.log(`Screenshot service listening on port ${port}`)
)
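
For reference, a minimal client sketch (assumes Node 18+ for the global fetch, the service listening on localhost:5678, and /items/1 as a purely illustrative path; none of these names are part of the commit):

import { writeFile } from 'node:fs/promises'

// the service resolves the request path against CAPTURE_URL, so this
// captures http://host.docker.internal:3000/items/1 by default
const res = await fetch('http://localhost:5678/items/1')
if (!res.ok) throw new Error(`capture failed with status ${res.status}`)
await writeFile('item.png', Buffer.from(await res.arrayBuffer()))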

capture/package-lock.json (generated, new file, 2637 lines; diff suppressed because it is too large)

capture/package.json (new file, 16 lines)

@@ -0,0 +1,16 @@
{
  "name": "capture",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "express": "^4.18.2",
    "puppeteer": "^20.8.2"
  },
  "type": "module"
}

components/seo.js

@@ -18,7 +18,7 @@ export function SeoSearch ({ sub }) {
       description: desc,
       images: [
         {
-          url: 'https://stacker.news/api/capture' + router.asPath
+          url: 'https://capture.stacker.news' + router.asPath
         }
       ],
       site_name: 'Stacker News'
@@ -81,7 +81,7 @@ export default function Seo ({ sub, item, user }) {
       description: desc,
       images: [
         {
-          url: 'https://stacker.news/api/capture' + pathNoQuery
+          url: 'https://capture.stacker.news' + pathNoQuery
         }
       ],
       site_name: 'Stacker News'

package-lock.json (generated, 13879 lines changed; diff suppressed because it is too large)

package.json (1 line removed)

@@ -57,7 +57,6 @@
     "nprogress": "^0.2.0",
     "opentimestamps": "^0.4.9",
     "page-metadata-parser": "^1.1.4",
-    "pageres": "^7.1.0",
     "pg-boss": "^9.0.3",
     "piexifjs": "^1.0.6",
     "prisma": "^5.4.2",

pages/api/capture/[[...path]].js (deleted, 78 lines)

@@ -1,78 +0,0 @@
import path from 'path'
import AWS from 'aws-sdk'
import { PassThrough } from 'stream'
import { datePivot } from '../../../lib/time'

const { spawn } = require('child_process')
const encodeS3URI = require('node-s3-url-encode')

const bucketName = 'sn-capture'
const bucketRegion = 'us-east-1'
const contentType = 'image/png'
const bucketUrl = 'https://sn-capture.s3.amazonaws.com/'
const s3PathPrefix = process.env.NODE_ENV === 'development' ? 'dev/' : ''

let capturing = false

AWS.config.update({
  region: bucketRegion
})

export default async function handler (req, res) {
  return new Promise(resolve => {
    const joinedPath = path.join(...(req.query.path || []))
    const searchQ = req.query.q ? `?q=${req.query.q}` : ''
    const s3PathPUT = s3PathPrefix + (joinedPath === '.' ? '_' : joinedPath) + searchQ
    const s3PathGET = s3PathPrefix + (joinedPath === '.' ? '_' : joinedPath) + encodeS3URI(searchQ)
    const url = process.env.PUBLIC_URL + '/' + joinedPath + searchQ
    const aws = new AWS.S3({ apiVersion: '2006-03-01' })

    // check to see if we have a recent version of the object
    aws.headObject({
      Bucket: bucketName,
      Key: s3PathPUT,
      IfModifiedSince: datePivot(new Date(), { minutes: -15 })
    }).promise().then(() => {
      // this path is cached so return it
      res.writeHead(302, { Location: bucketUrl + s3PathGET }).end()
      resolve()
    }).catch(() => {
      // we don't have it cached, so capture it and cache it
      if (capturing) {
        return res.writeHead(503, {
          'Retry-After': 1
        }).end()
      }
      capturing = true

      const pass = new PassThrough()
      aws.upload({
        Bucket: bucketName,
        Key: s3PathPUT,
        ACL: 'public-read',
        Body: pass,
        ContentType: contentType
      }).promise().catch(console.log)

      res.setHeader('Content-Type', contentType)

      const capture = spawn(
        'node', ['./spawn/capture.js', url], { maxBuffer: 1024 * 1024 * 5 })

      capture.on('close', code => {
        if (code !== 0) {
          res.status(500).end()
        } else {
          res.status(200).end()
        }
        pass.end()
        capture.removeAllListeners()
        capturing = false
        resolve()
      })
      capture.on('error', err => console.log('error', err))
      capture.stderr.on('data', data => console.log('error stderr', data.toString()))
      capture.stdout.on('data', data => {
        res.write(data)
        pass.write(data)
      })
    })
  })
}

spawn/capture.js (deleted, 17 lines)

@@ -1,17 +0,0 @@
#!/usr/bin/node

import Pageres from 'pageres'

async function captureUrl () {
  try {
    const streams = await new Pageres({ crop: true, scale: 2, timeout: 10, launchOptions: { args: ['--single-process'] } })
      .source(process.argv[2], ['600x315'])
      .run()
    process.stdout.write(streams[0], () => process.exit(0))
  } catch (e) {
    console.log(e)
    process.exit(1)
  }
}

captureUrl()

spawn/package.json (deleted, 3 lines)

@@ -1,3 +0,0 @@
{
  "type": "module"
}