Received: from malur.postgresql.org ([217.196.149.56]) by arkaria.postgresql.org with esmtps (TLS1.3) tls TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 (Exim 4.94.2) (envelope-from ) id 1rt8Gx-00CS0d-0b for pgsql-general@arkaria.postgresql.org; Sat, 06 Apr 2024 15:48:03 +0000 Received: from localhost ([127.0.0.1] helo=malur.postgresql.org) by malur.postgresql.org with esmtp (Exim 4.94.2) (envelope-from ) id 1rt8Gw-00DaEa-09 for pgsql-general@arkaria.postgresql.org; Sat, 06 Apr 2024 15:48:02 +0000 Received: from makus.postgresql.org ([2001:4800:3e1:1::229]) by malur.postgresql.org with esmtps (TLS1.3) tls TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 (Exim 4.94.2) (envelope-from ) id 1rt8Gv-00DaEM-DH for pgsql-general@lists.postgresql.org; Sat, 06 Apr 2024 15:48:01 +0000 Received: from mail-qv1-xf2d.google.com ([2607:f8b0:4864:20::f2d]) by makus.postgresql.org with esmtps (TLS1.3) tls TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 (Exim 4.94.2) (envelope-from ) id 1rt8Gs-001Dsr-4X for pgsql-general@lists.postgresql.org; Sat, 06 Apr 2024 15:48:00 +0000 Received: by mail-qv1-xf2d.google.com with SMTP id 6a1803df08f44-699308ac84aso16519296d6.0 for ; Sat, 06 Apr 2024 08:47:58 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1712418477; x=1713023277; darn=lists.postgresql.org; h=to:subject:message-id:date:from:in-reply-to:references:mime-version :from:to:cc:subject:date:message-id:reply-to; bh=rtJ3ixXe8TKsFCDDXo70Ve9RrhZmPqmGcW1iTCOnS3A=; b=MmA6Bmq5oowlL/176jemxrjoyDWWWe1PiOSckaU9IqQkRlcFIrxGK6PK81tMJJ+6Um MGSf5tdsxc/VZsbGa7zReStcQJ710GVAFisnhHvEIsUNRsv2vT2A8NuR5X8rDkSATv7E sb6KsFZm7Mcien6fWA61Dt3ZdNv3wRYQp2MWGDozfjysrmthjFl3BnQSRy0tQqH4YP61 rtily1Fr+1yv6D6NbRR8RBhTzpL5mo99A0JnalKbAO3ZyhkmqpFgx4PcZr+vC5OKIMwT 9H8f2MYvv/85en/qH3yglFPoBb9qXmwZzjCIA/RfUQYLO3cwtbNgmL6hTcm1vFpXgA+p O0kQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1712418477; x=1713023277; h=to:subject:message-id:date:from:in-reply-to:references:mime-version :x-gm-message-state:from:to:cc:subject:date:message-id:reply-to; bh=rtJ3ixXe8TKsFCDDXo70Ve9RrhZmPqmGcW1iTCOnS3A=; b=Xr2PPz+uaRjMnyT2GgzeYmFwPbmz5bijwk6kp9SfTuBQPn2EB9LKkDtnKCdnpeFVql CkrjtZZyvf4Y0j0PW3BagzR74w+gkBmYonXkp9mNZwnJ+HOl5N5lGOmWyWvrr5pQnx46 UVhwpvV4wb0gdhU7AgFED+bnhsd/mCsmzPEQilkGCe+ZOrNNTscUR1xXcHxkpoz1rDiC k/glk6FUkGQxlD9AGjuL3gmeyBFsKw70rujdnwdtDIDQ8pJsXkKaNHHcTisCJ0CJKJ3A YcNGkkh85HCKpo/Uq7wevMTkxac4x6xHIgc4j2Y5KpnpG1KFT0IRLSFVxl2lOGFhh2ok 4Aag== X-Forwarded-Encrypted: i=1; AJvYcCWWmlvbHWIsx4jR9/UEJXAiltQYgxPWjMSyraoTxRgPlmasBWiSW+brY2V+T86vG3QlNhd16zNOmnABufEFt9GchPdBvmjyFR6Qi/pPyVm0dGhz X-Gm-Message-State: AOJu0YyLBneVACx1oYs4gvlwdhIE408EIXTM1vzmhfCE9emd0Z4FVYRf kHOFzYaDAvS/mkWEkuYtlJBemPvWro3Y1o1C4rXumKdaCraTlk/EKgwvGaVbVZFrHRnUxrFYm8h 0LoqNzRjHuxbkZcE+nbalrJU0G2o= X-Google-Smtp-Source: AGHT+IHV+YRIZDlQnEcdDUfpychKlm1Bn4PSCLKmAAliEBVXDPIYog7BmS8+5yhT21TuYUukAGgd490mou9roSUDsPU= X-Received: by 2002:ad4:5de7:0:b0:699:2835:46b9 with SMTP id jn7-20020ad45de7000000b00699283546b9mr4397162qvb.33.1712418477180; Sat, 06 Apr 2024 08:47:57 -0700 (PDT) MIME-Version: 1.0 References: In-Reply-To: From: yudhi s Date: Sat, 6 Apr 2024 21:17:45 +0530 Message-ID: Subject: Re: Moving delta data faster To: veem v , Adrian Klaver , Greg Sabino Mullane , pgsql-general Content-Type: multipart/alternative; boundary="0000000000000a0e8306156f7fb6" List-Id: List-Help: List-Subscribe: List-Post: List-Owner: List-Archive: Archived-At: Precedence: bulk --0000000000000a0e8306156f7fb6 Content-Type: text/plain; charset="UTF-8" Thank you Adrian, Greg and Veem. I tried writing a small routine to see how the performance differs in these four approaches i.e. Upsert VS traditional update+insert VS Merge vs Truncate+load. Initially I was thinking Upsert will perform the same as Merge as the logic looks similar but it seems it's the worst performing among all, not sure why , yet to know the reason though. Truncate+ load seems to be the best performing among all. Hope i am doing it correctly. Please correct me if I'm wrong. UPSERT approach execution time: *00:00:20.921343* UPSERT approach rows: 1000000 insert/update approach execution time: *00:00:15.53612* insert/update approach update rows : 500000 insert/update approach Insert rows: 500000 MERGE approach execution time: *00:00:14.884623* MERGE approach rows: 1000000 truncate load approach execution time:* 00:00:07.428826* truncate load rows: 1000000 ********* Routine ******** **************** UPSERT Testcase *********** drop table source_tab; drop table target_tab; CREATE TABLE source_tab ( id SERIAL PRIMARY KEY, column1 VARCHAR(100), column2 VARCHAR(100) ); -- Create target table CREATE TABLE target_tab ( id SERIAL PRIMARY KEY, column1 VARCHAR(100), column2 VARCHAR(100) ); INSERT INTO source_tab (column1, column2) SELECT 'Value ' || i, 'Value ' || (i * 2) FROM generate_series(1, 1000000) AS i; INSERT INTO target_tab (column1, column2) SELECT 'Value ' || i, 'Value ' || (i * 2) FROM generate_series(1, 500000) AS i; DO $$ DECLARE start_time timestamp; end_time timestamp; rows_inserted integer:=0; rows_updated integer:=0; rows_upserted integer:=0; rows_merged integer:=0; BEGIN -- Measure performance of UPSERT start_time := clock_timestamp(); INSERT INTO target_tab (id, column1, column2) SELECT id, column1, column2 FROM source_tab ON CONFLICT (id) DO UPDATE SET column1 = EXCLUDED.column1, column2 = EXCLUDED.column2; get diagnostics rows_upserted=row_count; end_time := clock_timestamp(); RAISE NOTICE 'UPSERT approach execution time: %', end_time - start_time; RAISE NOTICE 'UPSERT approach rows: %', rows_upserted; rollback; END $$; **************** Traditional Insert+update Testcase *********** drop table source_tab; drop table target_tab; CREATE TABLE source_tab ( id SERIAL PRIMARY KEY, column1 VARCHAR(100), column2 VARCHAR(100) ); -- Create target table CREATE TABLE target_tab ( id SERIAL PRIMARY KEY, column1 VARCHAR(100), column2 VARCHAR(100) ); INSERT INTO source_tab (column1, column2) SELECT 'Value ' || i, 'Value ' || (i * 2) FROM generate_series(1, 1000000) AS i; INSERT INTO target_tab (column1, column2) SELECT 'Value ' || i, 'Value ' || (i * 2) FROM generate_series(1, 500000) AS i; DO $$ DECLARE start_time timestamp; end_time timestamp; rows_inserted integer:=0; rows_updated integer:=0; rows_upserted integer:=0; rows_merged integer:=0; BEGIN -- Measure performance of insert/update approach start_time := clock_timestamp(); -- Update existing records UPDATE target_tab AS t SET column1 = s.column1, column2 = s.column2 FROM source_tab AS s WHERE t.id = s.id; get diagnostics rows_updated=row_count; -- Insert new records INSERT INTO target_tab (id, column1, column2) SELECT s.id, s.column1, s.column2 FROM source_tab AS s LEFT JOIN target_tab AS t ON s.id = t.id WHERE t.id IS NULL; get diagnostics rows_inserted=row_count; end_time := clock_timestamp(); RAISE NOTICE 'insert/update approach execution time: %', end_time - start_time; RAISE NOTICE 'insert/update approach update rows : %', rows_updated; RAISE NOTICE 'insert/update approach Insert rows: %', rows_inserted; rollback; END $$; **************** MERGE Testcase *********** drop table source_tab; drop table target_tab; CREATE TABLE source_tab ( id SERIAL PRIMARY KEY, column1 VARCHAR(100), column2 VARCHAR(100) ); -- Create target table CREATE TABLE target_tab ( id SERIAL PRIMARY KEY, column1 VARCHAR(100), column2 VARCHAR(100) ); INSERT INTO source_tab (column1, column2) SELECT 'Value ' || i, 'Value ' || (i * 2) FROM generate_series(1, 1000000) AS i; INSERT INTO target_tab (column1, column2) SELECT 'Value ' || i, 'Value ' || (i * 2) FROM generate_series(1, 500000) AS i; DO $$ DECLARE start_time timestamp; end_time timestamp; rows_inserted integer:=0; rows_updated integer:=0; rows_upserted integer:=0; rows_merged integer:=0; begin start_time := clock_timestamp(); merge into target_tab t using source_tab s on t. id = s. id when matched then update set column1 = s.column1, column2 = s.column2 when not matched then insert values (id, column1, column2); get diagnostics rows_merged=row_count; end_time := clock_timestamp(); RAISE NOTICE 'MERGE approach execution time: %', end_time - start_time; RAISE NOTICE 'MERGE approach rows: %', rows_merged; rollback; END $$; **************** Truncate+load Testcase *********** drop table source_tab; drop table target_tab; CREATE TABLE source_tab ( id SERIAL PRIMARY KEY, column1 VARCHAR(100), column2 VARCHAR(100) ); -- Create target table CREATE TABLE target_tab ( id SERIAL PRIMARY KEY, column1 VARCHAR(100), column2 VARCHAR(100) ); INSERT INTO source_tab (column1, column2) SELECT 'Value ' || i, 'Value ' || (i * 2) FROM generate_series(1, 1000000) AS i; INSERT INTO target_tab (column1, column2) SELECT 'Value ' || i, 'Value ' || (i * 2) FROM generate_series(1, 500000) AS i; DO $$ DECLARE start_time timestamp; end_time timestamp; rows_inserted integer:=0; rows_updated integer:=0; rows_upserted integer:=0; rows_truncate_loaded integer:=0; begin start_time := clock_timestamp(); truncate table target_tab; INSERT INTO target_tab (id, column1, column2) SELECT s.id, s.column1, s.column2 FROM source_tab AS s; get diagnostics rows_truncate_loaded=row_count; end_time := clock_timestamp(); RAISE NOTICE 'truncate load approach execution time: %', end_time - start_time; RAISE NOTICE 'truncate load rows: %', rows_truncate_loaded; rollback; END $$; --0000000000000a0e8306156f7fb6 Content-Type: text/html; charset="UTF-8" Content-Transfer-Encoding: quoted-printable
Thank you Adrian, Greg and Veem.

=
I tried writing a small routine to see how the performance diffe= rs in these four approaches i.e. Upsert VS traditional update+insert VS Mer= ge vs Truncate+load.=C2=A0

Initially I was thinkin= g Upsert will perform the same as Merge as the logic looks similar but it s= eems it's the worst performing among all, not sure why , yet to know th= e reason though.=C2=A0Truncate+ load seems to be the best performing among = all. Hope i am doing it correctly. Please correct me if I'm wrong.

UPSERT approach execution tim= e: 00:00:20.921343
UPSERT approach rows: 1000000

insert/up= date approach execution time: 00:00:15.53612
insert/update approa= ch update rows : 500000
insert/update approach Insert rows: 500000
MERGE approach execution time: 00:00:14.884623
MERGE approach r= ows: 1000000

truncate load approach execution time: 00:00:07.4288= 26
truncate load rows: 1000000


*= ******** Routine ********
*************** UPSERT Testcase **********

drop table source_ta= b;
drop table target_tab;

CREATE TABLE source_tab (
id SERIAL = PRIMARY KEY,
column1 VARCHAR(100),
column2 VARCHAR(100)
);

= -- Create target table
CREATE TABLE target_tab (
id SERIAL PRIMARY KE= Y,
column1 VARCHAR(100),
column2 VARCHAR(100)
);

INSERT INT= O source_tab (column1, column2)
SELECT
'Value ' || i,
'= ;Value ' || (i * 2)
FROM generate_series(1, 1000000) AS i;

IN= SERT INTO target_tab (column1, column2)
SELECT
'Value ' || i,=
'Value ' || (i * 2)
FROM generate_series(1, 500000) AS i;
DO $$
DECLARE
start_time timestamp;
end_time timestamp;
ro= ws_inserted integer:=3D0;
rows_updated integer:=3D0;
rows_upserted in= teger:=3D0;
rows_merged integer:=3D0;
BEGIN
-- Measure performance= of UPSERT
start_time :=3D clock_timestamp();
INSERT INTO target_tab = (id, column1, column2)
SELECT id, column1, column2
FROM source_tabON CONFLICT (id) DO UPDATE
SET
column1 =3D EXCLUDED.column1,
colu= mn2 =3D EXCLUDED.column2;
get diagnostics rows_upserted=3Drow_count;
= end_time :=3D clock_timestamp();
RAISE NOTICE 'UPSERT approach execu= tion time: %', end_time - start_time;
RAISE NOTICE 'UPSERT appro= ach rows: %', rows_upserted;

rollback;
END $$;

****= *********** Traditional Insert+update Testcase **********

drop t= able source_tab;
drop table target_tab;

CREATE TABLE source_tab (=
id SERIAL PRIMARY KEY,
column1 VARCHAR(100),
column2 VARCHAR(100)=
);

-- Create target table
CREATE TABLE target_tab (
id SER= IAL PRIMARY KEY,
column1 VARCHAR(100),
column2 VARCHAR(100)
);
=
INSERT INTO source_tab (column1, column2)
SELECT
'Value '= || i,
'Value ' || (i * 2)
FROM generate_series(1, 1000000) A= S i;

INSERT INTO target_tab (column1, column2)
SELECT
'Val= ue ' || i,
'Value ' || (i * 2)
FROM generate_series(1, 50= 0000) AS i;

DO $$
DECLARE
start_time timestamp;
end_time ti= mestamp;
rows_inserted integer:=3D0;
rows_updated integer:=3D0;
ro= ws_upserted integer:=3D0;
rows_merged integer:=3D0;
BEGIN
-- Measu= re performance of insert/update approach
start_time :=3D clock_timestamp= ();
-- Update existing records
UPDATE target_tab AS t
SET
colum= n1 =3D s.column1,
column2 =3D s.column2
FROM source_tab AS s
WHERE= t.id =3D s.id;
= get diagnostics rows_updated=3Drow_count;

-- Insert new records
I= NSERT INTO target_tab (id, column1, column2)
SELECT s.id, s.column1, s.column2
FROM source_tab AS s
LEFT JOIN tar= get_tab AS t ON s.id =3D t.id
WHERE t.id IS NULL;
get diagno= stics rows_inserted=3Drow_count;

end_time :=3D clock_timestamp();RAISE NOTICE 'insert/update approach execution time: %', end_time = - start_time;
RAISE NOTICE 'insert/update approach update rows : %&#= 39;, rows_updated;
RAISE NOTICE 'insert/update approach Insert rows:= %', rows_inserted;

rollback;
END $$;

*************= ** MERGE Testcase **********

drop table source_tab;
drop tabl= e target_tab;

CREATE TABLE source_tab (
id SERIAL PRIMARY KEY,column1 VARCHAR(100),
column2 VARCHAR(100)
);

-- Create targe= t table
CREATE TABLE target_tab (
id SERIAL PRIMARY KEY,
column1 V= ARCHAR(100),
column2 VARCHAR(100)
);

INSERT INTO source_tab (c= olumn1, column2)
SELECT
'Value ' || i,
'Value ' ||= (i * 2)
FROM generate_series(1, 1000000) AS i;

INSERT INTO targe= t_tab (column1, column2)
SELECT
'Value ' || i,
'Value = ' || (i * 2)
FROM generate_series(1, 500000) AS i;

DO $$
D= ECLARE
start_time timestamp;
end_time timestamp;
rows_inserted int= eger:=3D0;
rows_updated integer:=3D0;
rows_upserted integer:=3D0;
= rows_merged integer:=3D0;
begin

start_time :=3D clock_timestamp()= ;

merge into
target_tab t
using source_tab s on
t. id =3D s= . id
when matched then
update
set column1 =3D s.column1,
column= 2 =3D s.column2
when not matched then
insert
values (id, column1, = column2);
get diagnostics rows_merged=3Drow_count;

end_time :=3D = clock_timestamp();
RAISE NOTICE 'MERGE approach execution time: %= 9;, end_time - start_time;
RAISE NOTICE 'MERGE approach rows: %'= , rows_merged;

rollback;
END $$;

*************** Trunca= te+load=C2=A0Testcase **********

drop table source_tab;
drop = table target_tab;

CREATE TABLE source_tab (
id SERIAL PRIMARY KEY= ,
column1 VARCHAR(100),
column2 VARCHAR(100)
);

-- Create t= arget table
CREATE TABLE target_tab (
id SERIAL PRIMARY KEY,
colum= n1 VARCHAR(100),
column2 VARCHAR(100)
);

INSERT INTO source_ta= b (column1, column2)
SELECT
'Value ' || i,
'Value '= ; || (i * 2)
FROM generate_series(1, 1000000) AS i;

INSERT INTO t= arget_tab (column1, column2)
SELECT
'Value ' || i,
'Va= lue ' || (i * 2)
FROM generate_series(1, 500000) AS i;

DO $$<= br>DECLARE
start_time timestamp;
end_time timestamp;
rows_inserted= integer:=3D0;
rows_updated integer:=3D0;
rows_upserted integer:=3D0;=
rows_truncate_loaded integer:=3D0;
begin

start_time :=3D cloc= k_timestamp();

truncate table target_tab;

INSERT INTO target_= tab (id, column1, column2)
SELECT s.id, s.co= lumn1, s.column2
FROM source_tab AS s;
get diagnostics rows_truncate_= loaded=3Drow_count;

end_time :=3D clock_timestamp();
RAISE NOTICE= 'truncate load approach execution time: %', end_time - start_time;=
RAISE NOTICE 'truncate load rows: %', rows_truncate_loaded;
=
rollback;
END $$;

--0000000000000a0e8306156f7fb6--