add db seed to sndev

This commit is contained in:
keyan 2024-03-07 15:19:49 -06:00
parent b3bf3dba28
commit af2890075c
3 changed files with 163017 additions and 78 deletions

1
.gitignore vendored
View File

@ -20,6 +20,7 @@ node_modules/
.DS_Store .DS_Store
*.pem *.pem
/*.sql /*.sql
!/anon.sql
lnbits/ lnbits/
# debug # debug

162659
anon.sql Normal file

File diff suppressed because one or more lines are too long

View File

@ -1,116 +1,395 @@
# install postgresql anonymizer extension
```bash
git clone https://gitlab.com/dalibo/postgresql_anonymizer.git git clone https://gitlab.com/dalibo/postgresql_anonymizer.git
make extension make extension
make install make install
```
-- todo need a function to modify foreign key distribution # take a small sample from prod database
-- like randomly assign a valid foreign key to a row ```bash
pg_sample --limit=""Item"=created_at >= now() - interval '10 days',"Donation"=created_at >= now() - interval '10 days',"Earn"=created_at >= now() - interval '10 days',"ItemAct"=created_at >= now() - interval '10 days',"Sub"=*,"SubAct"=*,_prisma_migrations=*" stackernews > sample.sql
```
anon.random_in(SELECT id FROM users) # create a new database from data
```bash
createdb sample
psql sample < sample.sql
```
# initialize the extension
we turn on the privacy by default just to make sure default values are set if we forget to mask a column
```sql
ALTER DATABASE sample SET session_preload_libraries = 'anon'; ALTER DATABASE sample SET session_preload_libraries = 'anon';
ALTER DATABASE sample SET anon.privacy_by_default = true; ALTER DATABASE sample SET anon.privacy_by_default = true;
-- restart session ```
-- turn off triggers
restart session
# begin statically masking the data
We want to keep the variety of data for development purposes, but the exactness is irrelevant. We lose quite a bit of data consistency but that shouldn't matter. We turn off triggers to make this faster.
In some future version it might be worth keeping data consistency and masking other tables.
### initialize the extension
```sql
-- turn off triggers to make this faster
SET session_replication_role = replica; SET session_replication_role = replica;
CREATE EXTENSION IF NOT EXISTS anon CASCADE; CREATE EXTENSION IF NOT EXISTS anon CASCADE;
SELECT anon.init(); SELECT anon.init();
```
-- basically we dont want to mask the following columns ... preferring instead to shuffle the data ### drop all sensitive tables we won't need
-- and in some cases introduce noise ```sql
DELETE FROM pgboss.job;
DELETE FROM pgboss.archive;
DELETE FROM "Snl";
DELETE FROM "Wallet";
DELETE FROM "WalletLightningAddress";
DELETE FROM "WalletLND";
DELETE FROM "Mute";
DELETE FROM "Arc";
DELETE FROM "Streak";
DELETE FROM "NostrRelay";
DELETE FROM "UserNostrRelay";
DELETE FROM "ItemUpload";
DELETE FROM "Upload";
DELETE FROM "LnAuth";
DELETE FROM "LnWith";
DELETE FROM "Invite";
DELETE FROM "Message";
DELETE FROM "ItemForward";
DELETE FROM "PollOption";
DELETE FROM "PollVote";
DELETE FROM "MuteSub";
DELETE FROM "Pin";
DELETE FROM "ReferralAct";
DELETE FROM "Mention";
DELETE FROM "Invoice";
DELETE FROM "Withdrawl";
DELETE FROM "accounts";
DELETE FROM "OFAC";
DELETE FROM "sessions";
DELETE FROM "verification_requests";
DELETE FROM "Bookmark";
DELETE FROM "ThreadSubscription";
DELETE FROM "UserSubscription";
DELETE FROM "PushSubscription";
DELETE FROM "Log";
DELETE FROM "TerritoryTransfer";
```
### mask and shuffle the users table
```sql
-- users -- users
SELECT anon.shuffle_column('users', 'created_at', 'id'); SECURITY LABEL FOR anon ON COLUMN users.created_at
SELECT anon.shuffle_column('users', 'updated_at', 'id'); IS 'MASKED WITH FUNCTION anon.random_in_tsrange(''[2021-10-01,2024-2-20]'')';
SELECT anon.shuffle_column('users', 'lastSeenAt', 'id'); SECURITY LABEL FOR anon ON COLUMN users.updated_at
SELECT anon.shuffle_column('users', 'inviteId', 'id'); IS 'MASKED WITH FUNCTION anon.random_in_tsrange(''[2021-10-01,2024-2-20]'')';
SELECT anon.shuffle_column('users', 'referrerId', 'id'); SECURITY LABEL FOR anon ON COLUMN users.msats
SELECT anon.shuffle_column('users', 'msats', 'id'); IS 'MASKED WITH FUNCTION anon.random_in_int8range(''[0,250000000]'')';
SELECT anon.shuffle_column('users', 'stackedMsats', 'id'); SECURITY LABEL FOR anon ON COLUMN users."stackedMsats"
SELECT anon.shuffle_column('users', 'bioId', 'id'); IS 'MASKED WITH FUNCTION anon.random_in_int8range(''[0,2500000000]'')';
-- introduce noise on these columns
SELECT anon.add_noise_on_numeric_column('users', 'msats', 1);
SELECT anon.add_noise_on_numeric_column('users', 'stackedMsats', 1);
-- set masking for columns we want to mask -- set masking for columns we want to mask
SECURITY LABEL FOR anon ON COLUMN users.name SECURITY LABEL FOR anon ON COLUMN users.name
IS 'MASKED WITH VALUE anon.fake_first_name() || anon.fake_last_name() || anon.random_string(3)'; IS 'MASKED WITH VALUE anon.fake_first_name() || anon.fake_last_name() || anon.random_string(3)';
-- set not to mask for columns we don't want to mask -- set not to mask for columns we don't want to mask
SECURITY LABEL FOR anon ON COLUMN users.created_at SECURITY LABEL FOR anon ON COLUMN users.id
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN users.updated_at
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN users."lastSeenAt"
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN users."inviteId"
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN users."referrerId"
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN users.msats
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN users."stackedMsats"
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN users."bioId"
IS 'NOT MASKED'; IS 'NOT MASKED';
SELECT anon.anonymize_table('users');
```
select ### mask other tables mostly by randomizing the userId column
#### donation
```sql
-- donation -- donation
SELECT anon.shuffle_column('"Donation"', 'created_at', 'id');
SELECT anon.shuffle_column('"Donation"', 'updated_at', 'id');
-- introduce noise on these columns
SELECT anon.add_noise_on_numeric_column('"Donation"', 'sats', 1);
-- set masking for columns we want to mask -- set masking for columns we want to mask
SECURITY LABEL FOR anon ON COLUMN "Donation"."userId" SECURITY LABEL FOR anon ON COLUMN "Donation"."userId"
IS 'MASKED WITH FUNCTION anon.random_in(ARRAY(SELECT id FROM users))'; IS 'MASKED WITH FUNCTION anon.random_in(ARRAY(SELECT id FROM public.users))';
-- set not to mask for columns we don't want to mask -- set not to mask for columns we don't want to mask
SECURITY LABEL FOR anon ON COLUMN "Donation".id
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "Donation".sats
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "Donation".created_at SECURITY LABEL FOR anon ON COLUMN "Donation".created_at
IS 'NOT MASKED'; IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "Donation".updated_at SECURITY LABEL FOR anon ON COLUMN "Donation".updated_at
IS 'NOT MASKED'; IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN users."lastSeenAt" SELECT anon.anonymize_table('"Donation"');
```
#### earn
```sql
-- set masking for columns we want to mask
SECURITY LABEL FOR anon ON COLUMN "Earn"."userId"
IS 'MASKED WITH FUNCTION anon.random_in(ARRAY(SELECT id FROM public.users))';
-- set not to mask for columns we don't want to mask
SECURITY LABEL FOR anon ON COLUMN "Earn".id
IS 'NOT MASKED'; IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "Earn".created_at
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "Earn".updated_at
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "Earn".msats
IS 'NOT MASKED';
SELECT anon.anonymize_table('"Earn"');
```
#### item
```sql
SECURITY LABEL FOR anon ON COLUMN "Item"."userId"
IS 'MASKED WITH FUNCTION anon.random_in(ARRAY(SELECT id FROM public.users))';
-- set not to mask for columns we don't want to mask
SECURITY LABEL FOR anon ON COLUMN "Item".id
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "Item".created_at
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "Item".msats
IS 'MASKED WITH FUNCTION anon.random_in_int8range(''[0,250000000]'')';
SECURITY LABEL FOR anon ON COLUMN "Item"."weightedVotes"
IS 'MASKED WITH FUNCTION anon.random_in_numrange(''[0,30]'')';
SECURITY LABEL FOR anon ON COLUMN "Item".text
IS 'MASKED WITH VALUE CASE WHEN "Item".text IS NULL
THEN "Item".text
ELSE anon.lorem_ipsum(characters := LENGTH("Item".text))
END';
SECURITY LABEL FOR anon ON COLUMN "Item".title
IS 'MASKED WITH VALUE CASE WHEN "Item".title IS NULL
THEN "Item".title
ELSE anon.lorem_ipsum(characters := LENGTH("Item".title))
END';
SECURITY LABEL FOR anon ON COLUMN "Item".url
IS 'MASKED WITH VALUE ''https://example.com/''';
SECURITY LABEL FOR anon ON COLUMN "Item".updated_at
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "Item".path
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "Item"."parentId"
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "Item"."subName"
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "Item"."ncomments"
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "Item"."rootId"
IS 'NOT MASKED';
SELECT anon.anonymize_table('"Item"');
```
#### itemAct
```sql
SECURITY LABEL FOR anon ON COLUMN "ItemAct"."userId"
IS 'MASKED WITH FUNCTION anon.random_in(ARRAY(SELECT id FROM public.users))';
-- set not to mask for columns we don't want to mask
SECURITY LABEL FOR anon ON COLUMN "ItemAct"."id"
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "ItemAct"."itemId"
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "ItemAct".created_at
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "ItemAct".updated_at
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "ItemAct".act
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "ItemAct".msats
IS 'NOT MASKED';
SELECT anon.anonymize_table('"ItemAct"');
```
-- earn #### sub
-- Invite ```sql
-- Item SECURITY LABEL FOR anon ON COLUMN "Sub"."userId"
-- ItemAct IS 'MASKED WITH FUNCTION anon.random_in(ARRAY(SELECT id FROM public.users))';
-- ItemForward -- set not to mask for columns we don't want to mask
-- PollOption SECURITY LABEL FOR anon ON COLUMN "Sub"."name"
-- PollVote IS 'NOT MASKED';
-- Sub SECURITY LABEL FOR anon ON COLUMN "Sub"."path"
-- SubAct IS 'NOT MASKED';
-- Pin SECURITY LABEL FOR anon ON COLUMN "Sub".created_at
-- ReferralAct IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "Sub".updated_at
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "Sub"."billingType"
IS 'MASKED WITH VALUE ''ONCE''';
SECURITY LABEL FOR anon ON COLUMN "Sub"."billingCost"
IS 'MASKED WITH VALUE 0';
SECURITY LABEL FOR anon ON COLUMN "Sub"."rankingType"
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN "Sub"."postTypes"
IS 'NOT MASKED';
SELECT anon.anonymize_table('"Sub"');
```
-- truncate tables that are irrelevant for local development #### subAct
TRUNCATE TABLE "Wallet"; ```sql
TRUNCATE TABLE "WalletLightningAddress"; SECURITY LABEL FOR anon ON COLUMN "SubAct"."userId"
TRUNCATE TABLE "WalletLND"; IS 'MASKED WITH FUNCTION anon.random_in(ARRAY(SELECT id FROM public.users))';
TRUNCATE TABLE "Mute"; -- shuffle the subName column
TRUNCATE TABLE "Arc"; SELECT anon.shuffle_column('"SubAct"', 'subName', 'id');
TRUNCATE TABLE "Streak"; SELECT anon.shuffle_column('"SubAct"', 'msats', 'id');
TRUNCATE TABLE "NostrRelay"; -- set not to mask for columns we don't want to mask
TRUNCATE TABLE "UserNostrRelay"; SECURITY LABEL FOR anon ON COLUMN "SubAct".id
TRUNCATE TABLE "LNAuth"; IS 'NOT MASKED';
TRUNCATE TABLE "LnWith"; SECURITY LABEL FOR anon ON COLUMN "SubAct"."subName"
TRUNCATE TABLE "Message"; IS 'NOT MASKED';
TRUNCATE TABLE "MuteSub"; SECURITY LABEL FOR anon ON COLUMN "SubAct".type
TRUNCATE TABLE "Mention"; IS 'NOT MASKED';
TRUNCATE TABLE "Invoice"; SECURITY LABEL FOR anon ON COLUMN "SubAct".msats
TRUNCATE TABLE "Withdrawal"; IS 'NOT MASKED';
TRUNCATE TABLE "Account"; SECURITY LABEL FOR anon ON COLUMN "SubAct".created_at
TRUNCATE TABLE "OFAC"; IS 'NOT MASKED';
TRUNCATE TABLE "Session"; SECURITY LABEL FOR anon ON COLUMN "SubAct".updated_at
TRUNCATE TABLE "VerificationToken"; IS 'NOT MASKED';
TRUNCATE TABLE "ThreadSubscription"; SELECT anon.anonymize_table('"SubAct"');
TRUNCATE TABLE "UserSubscription"; ```
TRUNCATE TABLE "PushSubscription";
TRUNCATE TABLE "Log";
SELECT anon.add_noise_on_datetime_column('users', 'created_at', '1 year'); #### _prisma_migrations
SELECT anon.add_noise_on_datetime_column('users', 'updated_at', '1 year');
don't mask this table
```sql
SECURITY LABEL FOR anon ON COLUMN _prisma_migrations.id
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN _prisma_migrations.checksum
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN _prisma_migrations.finished_at
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN _prisma_migrations.migration_name
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN _prisma_migrations.started_at
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN _prisma_migrations.applied_steps_count
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN _prisma_migrations.rolled_back_at
IS 'NOT MASKED';
SELECT anon.anonymize_table('_prisma_migrations');
```
#### pgboss.schedule
SELECT anon.shuffle_column('"Item"', 'userId', 'id'); don't mask this table
```sql
SECURITY LABEL FOR anon ON COLUMN pgboss.schedule.name
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN pgboss.schedule.cron
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN pgboss.schedule.timezone
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN pgboss.schedule.data
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN pgboss.schedule.options
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN pgboss.schedule.created_on
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN pgboss.schedule.updated_on
IS 'NOT MASKED';
SELECT anon.anonymize_table('pgboss.schedule');
```
#### pgboss.version
don't mask this table
```sql
SECURITY LABEL FOR anon ON COLUMN pgboss.version.version
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN pgboss.version.maintained_on
IS 'NOT MASKED';
SECURITY LABEL FOR anon ON COLUMN pgboss.version.cron_on
IS 'NOT MASKED';
SELECT anon.anonymize_table('pgboss.version');
```
# turn triggers back on
```sql
SET session_replication_role = DEFAULT;
DROP EXTENSION IF EXISTS anon CASCADE;
```
# refresh all materialized views
```sql
SET search_path TO public;
CREATE OR REPLACE FUNCTION public.RefreshAllMaterializedViews(schema_arg TEXT DEFAULT 'public')
RETURNS INT AS $$
DECLARE
r RECORD;
BEGIN
RAISE NOTICE 'Refreshing materialized view in schema %', schema_arg;
FOR r IN SELECT matviewname FROM pg_matviews WHERE schemaname = schema_arg
LOOP
RAISE NOTICE 'Refreshing %.%', schema_arg, r.matviewname;
EXECUTE 'REFRESH MATERIALIZED VIEW ' || schema_arg || '.' || r.matviewname;
END LOOP;
RETURN 1;
END
$$ LANGUAGE plpgsql;
-- make sure materialized views are refreshed
SELECT public.RefreshAllMaterializedViews();
```
# dump it
```bash
pg_dump sample --no-owner --no-security-labels > anon.sql
```
# modify search_path
```sql
SELECT pg_catalog.set_config('search_path', 'public', false);
```
# set the time zone to UTC
```sql
ALTER DATABASE stackernews SET timezone TO 'UTC';
```
# modify the dump to timewarp the data at the end
```sql
CREATE OR REPLACE FUNCTION timewarp()
RETURNS VOID LANGUAGE plpgsql AS $$
DECLARE
r RECORD;
max_timestamp TIMESTAMP;
interval_to_add INTERVAL;
BEGIN
FOR r IN SELECT c.table_schema, c.table_name, c.column_name
FROM information_schema.columns c
JOIN information_schema.tables t ON c.table_schema = t.table_schema AND c.table_name = t.table_name
WHERE c.data_type IN ('timestamp without time zone', 'timestamp with time zone')
AND c.table_schema NOT IN ('pg_catalog', 'information_schema') -- Exclude system schemas
AND t.table_type = 'BASE TABLE' -- Ensure targeting only user-defined tables (excluding views)
AND t.table_schema NOT LIKE 'pg_%' -- Exclude other potential PostgreSQL system schemas
LOOP
-- Calculate the maximum value in the column
EXECUTE format('SELECT max(%I) FROM %I.%I', r.column_name, r.table_schema, r.table_name) INTO max_timestamp;
-- If there's a maximum value, calculate the interval and update the column
IF max_timestamp IS NOT NULL THEN
interval_to_add := now() - max_timestamp;
EXECUTE format('UPDATE %I.%I SET %I = %I + %L', r.table_schema, r.table_name, r.column_name, r.column_name, interval_to_add);
END IF;
END LOOP;
END;
$$;
SELECT timewarp();
```
# fix denormalized comment stuff
```sql
UPDATE "Item" p SET (ncomments, "commentMsats") =
(SELECT COALESCE(count(*), 0), COALESCE(sum(msats), 0)
FROM "Item" c
WHERE c.path <@ p.path AND p.id <> c.id);
```
# index all the tables
```sql
INSERT INTO pgboss.job (name) VALUES ('indexAllItems');
```