# install postgresql anonymizer extension ```bash git clone https://gitlab.com/dalibo/postgresql_anonymizer.git make extension make install ``` # take a small sample from prod database ```bash pg_sample --limit=""Item"=created_at >= now() - interval '10 days',"Donation"=created_at >= now() - interval '10 days',"Earn"=created_at >= now() - interval '10 days',"ItemAct"=created_at >= now() - interval '10 days',"Sub"=*,"SubAct"=*,_prisma_migrations=*" stackernews > sample.sql ``` # create a new database from data ```bash createdb sample psql sample < sample.sql ``` # initialize the extension we turn on the privacy by default just to make sure default values are set if we forget to mask a column ```sql ALTER DATABASE sample SET session_preload_libraries = 'anon'; ALTER DATABASE sample SET anon.privacy_by_default = true; ``` restart session # begin statically masking the data We want to keep the variety of data for development purposes, but the exactness is irrelevant. We lose quite a bit of data consistency but that shouldn't matter. We turn off triggers to make this faster. In some future version it might be worth keeping data consistency and masking other tables. ### initialize the extension ```sql -- turn off triggers to make this faster SET session_replication_role = replica; CREATE EXTENSION IF NOT EXISTS anon CASCADE; SELECT anon.init(); ``` ### drop all sensitive tables we won't need ```sql DELETE FROM pgboss.job; DELETE FROM pgboss.archive; DELETE FROM "Snl"; DELETE FROM "Wallet"; DELETE FROM "WalletLightningAddress"; DELETE FROM "WalletLND"; DELETE FROM "Mute"; DELETE FROM "Arc"; DELETE FROM "Streak"; DELETE FROM "NostrRelay"; DELETE FROM "UserNostrRelay"; DELETE FROM "ItemUpload"; DELETE FROM "Upload"; DELETE FROM "LnAuth"; DELETE FROM "LnWith"; DELETE FROM "Invite"; DELETE FROM "Message"; DELETE FROM "ItemForward"; DELETE FROM "PollOption"; DELETE FROM "PollVote"; DELETE FROM "MuteSub"; DELETE FROM "Pin"; DELETE FROM "ReferralAct"; DELETE FROM "Mention"; DELETE FROM "Invoice"; DELETE FROM "Withdrawl"; DELETE FROM "accounts"; DELETE FROM "OFAC"; DELETE FROM "sessions"; DELETE FROM "verification_requests"; DELETE FROM "Bookmark"; DELETE FROM "ThreadSubscription"; DELETE FROM "UserSubscription"; DELETE FROM "PushSubscription"; DELETE FROM "Log"; DELETE FROM "TerritoryTransfer"; ``` ### mask and shuffle the users table ```sql -- users SECURITY LABEL FOR anon ON COLUMN users.created_at IS 'MASKED WITH FUNCTION anon.random_in_tsrange(''[2021-10-01,2024-2-20]'')'; SECURITY LABEL FOR anon ON COLUMN users.updated_at IS 'MASKED WITH FUNCTION anon.random_in_tsrange(''[2021-10-01,2024-2-20]'')'; SECURITY LABEL FOR anon ON COLUMN users.msats IS 'MASKED WITH FUNCTION anon.random_in_int8range(''[0,250000000]'')'; SECURITY LABEL FOR anon ON COLUMN users."stackedMsats" IS 'MASKED WITH FUNCTION anon.random_in_int8range(''[0,2500000000]'')'; -- set masking for columns we want to mask SECURITY LABEL FOR anon ON COLUMN users.name IS 'MASKED WITH VALUE anon.fake_first_name() || anon.fake_last_name() || anon.random_string(3)'; -- set not to mask for columns we don't want to mask SECURITY LABEL FOR anon ON COLUMN users.id IS 'NOT MASKED'; SELECT anon.anonymize_table('users'); ``` ### mask other tables mostly by randomizing the userId column #### donation ```sql -- donation -- set masking for columns we want to mask SECURITY LABEL FOR anon ON COLUMN "Donation"."userId" IS 'MASKED WITH FUNCTION anon.random_in(ARRAY(SELECT id FROM public.users))'; -- set not to mask for columns we don't want to mask SECURITY LABEL FOR anon ON COLUMN "Donation".id IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Donation".sats IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Donation".created_at IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Donation".updated_at IS 'NOT MASKED'; SELECT anon.anonymize_table('"Donation"'); ``` #### earn ```sql -- set masking for columns we want to mask SECURITY LABEL FOR anon ON COLUMN "Earn"."userId" IS 'MASKED WITH FUNCTION anon.random_in(ARRAY(SELECT id FROM public.users))'; -- set not to mask for columns we don't want to mask SECURITY LABEL FOR anon ON COLUMN "Earn".id IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Earn".created_at IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Earn".updated_at IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Earn".msats IS 'NOT MASKED'; SELECT anon.anonymize_table('"Earn"'); ``` #### item ```sql SECURITY LABEL FOR anon ON COLUMN "Item"."userId" IS 'MASKED WITH FUNCTION anon.random_in(ARRAY(SELECT id FROM public.users))'; -- set not to mask for columns we don't want to mask SECURITY LABEL FOR anon ON COLUMN "Item".id IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Item".created_at IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Item".msats IS 'MASKED WITH FUNCTION anon.random_in_int8range(''[0,250000000]'')'; SECURITY LABEL FOR anon ON COLUMN "Item"."weightedVotes" IS 'MASKED WITH FUNCTION anon.random_in_numrange(''[0,30]'')'; SECURITY LABEL FOR anon ON COLUMN "Item".text IS 'MASKED WITH VALUE CASE WHEN "Item".text IS NULL THEN "Item".text ELSE anon.lorem_ipsum(characters := LENGTH("Item".text)) END'; SECURITY LABEL FOR anon ON COLUMN "Item".title IS 'MASKED WITH VALUE CASE WHEN "Item".title IS NULL THEN "Item".title ELSE anon.lorem_ipsum(characters := LENGTH("Item".title)) END'; SECURITY LABEL FOR anon ON COLUMN "Item".url IS 'MASKED WITH VALUE ''https://example.com/'''; SECURITY LABEL FOR anon ON COLUMN "Item".updated_at IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Item".path IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Item"."parentId" IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Item"."subName" IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Item"."ncomments" IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Item"."rootId" IS 'NOT MASKED'; SELECT anon.anonymize_table('"Item"'); ``` #### itemAct ```sql SECURITY LABEL FOR anon ON COLUMN "ItemAct"."userId" IS 'MASKED WITH FUNCTION anon.random_in(ARRAY(SELECT id FROM public.users))'; -- set not to mask for columns we don't want to mask SECURITY LABEL FOR anon ON COLUMN "ItemAct"."id" IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "ItemAct"."itemId" IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "ItemAct".created_at IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "ItemAct".updated_at IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "ItemAct".act IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "ItemAct".msats IS 'NOT MASKED'; SELECT anon.anonymize_table('"ItemAct"'); ``` #### sub ```sql SECURITY LABEL FOR anon ON COLUMN "Sub"."userId" IS 'MASKED WITH FUNCTION anon.random_in(ARRAY(SELECT id FROM public.users))'; -- set not to mask for columns we don't want to mask SECURITY LABEL FOR anon ON COLUMN "Sub"."name" IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Sub"."path" IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Sub".created_at IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Sub".updated_at IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Sub"."billingType" IS 'MASKED WITH VALUE ''ONCE'''; SECURITY LABEL FOR anon ON COLUMN "Sub"."billingCost" IS 'MASKED WITH VALUE 0'; SECURITY LABEL FOR anon ON COLUMN "Sub"."rankingType" IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "Sub"."postTypes" IS 'NOT MASKED'; SELECT anon.anonymize_table('"Sub"'); ``` #### subAct ```sql SECURITY LABEL FOR anon ON COLUMN "SubAct"."userId" IS 'MASKED WITH FUNCTION anon.random_in(ARRAY(SELECT id FROM public.users))'; -- shuffle the subName column SELECT anon.shuffle_column('"SubAct"', 'subName', 'id'); SELECT anon.shuffle_column('"SubAct"', 'msats', 'id'); -- set not to mask for columns we don't want to mask SECURITY LABEL FOR anon ON COLUMN "SubAct".id IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "SubAct"."subName" IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "SubAct".type IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "SubAct".msats IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "SubAct".created_at IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN "SubAct".updated_at IS 'NOT MASKED'; SELECT anon.anonymize_table('"SubAct"'); ``` #### _prisma_migrations don't mask this table ```sql SECURITY LABEL FOR anon ON COLUMN _prisma_migrations.id IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN _prisma_migrations.checksum IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN _prisma_migrations.finished_at IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN _prisma_migrations.migration_name IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN _prisma_migrations.started_at IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN _prisma_migrations.applied_steps_count IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN _prisma_migrations.rolled_back_at IS 'NOT MASKED'; SELECT anon.anonymize_table('_prisma_migrations'); ``` #### pgboss.schedule don't mask this table ```sql SECURITY LABEL FOR anon ON COLUMN pgboss.schedule.name IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN pgboss.schedule.cron IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN pgboss.schedule.timezone IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN pgboss.schedule.data IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN pgboss.schedule.options IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN pgboss.schedule.created_on IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN pgboss.schedule.updated_on IS 'NOT MASKED'; SELECT anon.anonymize_table('pgboss.schedule'); ``` #### pgboss.version don't mask this table ```sql SECURITY LABEL FOR anon ON COLUMN pgboss.version.version IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN pgboss.version.maintained_on IS 'NOT MASKED'; SECURITY LABEL FOR anon ON COLUMN pgboss.version.cron_on IS 'NOT MASKED'; SELECT anon.anonymize_table('pgboss.version'); ``` # turn triggers back on ```sql SET session_replication_role = DEFAULT; DROP EXTENSION IF EXISTS anon CASCADE; ``` # refresh all materialized views ```sql SET search_path TO public; CREATE OR REPLACE FUNCTION public.RefreshAllMaterializedViews(schema_arg TEXT DEFAULT 'public') RETURNS INT AS $$ DECLARE r RECORD; BEGIN RAISE NOTICE 'Refreshing materialized view in schema %', schema_arg; FOR r IN SELECT matviewname FROM pg_matviews WHERE schemaname = schema_arg LOOP RAISE NOTICE 'Refreshing %.%', schema_arg, r.matviewname; EXECUTE 'REFRESH MATERIALIZED VIEW ' || schema_arg || '.' || r.matviewname; END LOOP; RETURN 1; END $$ LANGUAGE plpgsql; -- make sure materialized views are refreshed SELECT public.RefreshAllMaterializedViews(); ``` # dump it ```bash pg_dump sample --no-owner --no-security-labels > anon.sql ``` # modify search_path ```sql SELECT pg_catalog.set_config('search_path', 'public', false); ``` # set the time zone to UTC ```sql ALTER DATABASE stackernews SET timezone TO 'UTC'; ``` # modify the dump to timewarp the data at the end ```sql CREATE OR REPLACE FUNCTION timewarp() RETURNS VOID LANGUAGE plpgsql AS $$ DECLARE r RECORD; max_timestamp TIMESTAMP; interval_to_add INTERVAL; BEGIN FOR r IN SELECT c.table_schema, c.table_name, c.column_name FROM information_schema.columns c JOIN information_schema.tables t ON c.table_schema = t.table_schema AND c.table_name = t.table_name WHERE c.data_type IN ('timestamp without time zone', 'timestamp with time zone') AND c.table_schema NOT IN ('pg_catalog', 'information_schema') -- Exclude system schemas AND t.table_type = 'BASE TABLE' -- Ensure targeting only user-defined tables (excluding views) AND t.table_schema NOT LIKE 'pg_%' -- Exclude other potential PostgreSQL system schemas LOOP -- Calculate the maximum value in the column EXECUTE format('SELECT max(%I) FROM %I.%I', r.column_name, r.table_schema, r.table_name) INTO max_timestamp; -- If there's a maximum value, calculate the interval and update the column IF max_timestamp IS NOT NULL THEN interval_to_add := now() - max_timestamp; EXECUTE format('UPDATE %I.%I SET %I = %I + %L', r.table_schema, r.table_name, r.column_name, r.column_name, interval_to_add); END IF; END LOOP; END; $$; SELECT timewarp(); ``` # fix denormalized comment stuff ```sql UPDATE "Item" p SET (ncomments, "commentMsats") = (SELECT COALESCE(count(*), 0), COALESCE(sum(msats), 0) FROM "Item" c WHERE c.path <@ p.path AND p.id <> c.id); ``` # index all the tables ```sql INSERT INTO pgboss.job (name) VALUES ('indexAllItems'); ```