PostgreSQL Procedural Languages:
Tips, Tricks & Gotchas
Who Am I?
● Jim Mlodgenski– [email protected]
– @jim_mlodgenski
● Co-organizer of– NYC PUG (www.nycpug.org)
– Philly PUG (www.phlpug.org)
● CTO, OpenSCG– www.openscg.com
Stored procedures/functions
● Code that runs inside of the database● Used for:
– Performance
– Security
– Convenience
functions=# SELECT airport FROM bird_strikes LIMIT 5; airport -------------------------- NEWARK LIBERTY INTL ARPT UNKNOWN DENVER INTL AIRPORT CHICAGO O'HARE INTL ARPT JOHN F KENNEDY INTL(5 rows)
Source: http://wildlife.faa.gov/
Sample Data
functions=# SELECT count(*)functions-# FROM bird_strikesfunctions-# WHERE get_iata_code_from_abbr_name(airport) = 'LAX'; count ------- 850(1 row)
Time: 13490.611 ms
Data Formatting Functions
functions=# EXPLAIN ANALYZE SELECT count(*) FROM bird_strikes ... QUERY PLAN ------------------------------------------------------------------------ Aggregate (cost=29418.79..29418.80 rows=1 width=0) (actual time=13463.628..13463.629 rows=1 loops=1) -> Seq Scan on bird_strikes (cost=0.00..29417.55 rows=497 width=0) (actual time=15.721..13463.293 rows=850 loops=1) Filter: ((get_iata_code_from_abbr_name(airport))::text = 'LAX'::text) Rows Removed by Filter: 98554 Planning time: 0.124 ms Execution time: 13463.682 ms(6 rows)
Check Performance
functions=# set track_functions = 'pl';SET
functions=# select * from pg_stat_user_functions;(No rows)
functions=# SELECT count(*) FROM bird_strikes ...-[ RECORD 1 ]count | 850
Track Function Usage
functions=# select * from pg_stat_user_functions;-[ RECORD 1 ]----------------------------funcid | 41247schemaname | publicfuncname | get_iata_code_from_namecalls | 88547total_time | 12493.419self_time | 12493.419-[ RECORD 2 ]----------------------------funcid | 41246schemaname | publicfuncname | get_iata_code_from_abbr_namecalls | 99404total_time | 13977.674self_time | 1484.255
Isolate Performance Issues
CREATE OR REPLACE FUNCTION get_iata_code_from_abbr_name(abbr_name varchar) RETURNS varchar AS$$DECLARE working_name varchar; code varchar := null;BEGIN working_name := upper(abbr_name);
IF working_name = 'UNKNOWN' THEN RETURN null; END IF;
working_name := replace(working_name, 'INTL', 'INTERNATIONAL'); working_name := replace(working_name, 'ARPT', 'AIRPORT'); working_name := replace(working_name, 'MUNI', 'MUNICIPAL'); working_name := replace(working_name, 'METRO', 'METROPOLITAN'); working_name := replace(working_name, 'NATL', 'NATIONAL');
working_name := replace(working_name, '-', ' '); working_name := replace(working_name, '/', ' '); working_name := working_name || '%';
code := get_iata_code_from_name(working_name);
RETURN code;END;$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION get_iata_code_from_name(airport_name varchar) RETURNS varchar AS$$DECLARE working_name varchar; code varchar := null;BEGIN working_name := upper(airport_name);
EXECUTE $__$ SELECT iata_code FROM airports WHERE upper(name) LIKE $1 $__$ INTO code USING working_name;
RETURN code;END;$$ LANGUAGE plpgsql;
Debugger
http://git.postgresql.org/gitweb/?p=pldebugger.git
functions=# select * from pl_profiler ; func_oid | line_number | line | exec_count | total_time | longest_time ----------+-------------+---------------------------------------------------------------------+------------+------------+-------------- 41246 | 1 | | 0 | 0 | 0 41246 | 2 | DECLARE | 0 | 0 | 0 41246 | 3 | working_name varchar; | 0 | 0 | 0 41246 | 4 | code varchar := null; | 0 | 0 | 0 41246 | 5 | BEGIN | 0 | 0 | 0 41246 | 6 | working_name := upper(abbr_name); | 99404 | 210587 | 363 41246 | 7 | | 0 | 0 | 0 41246 | 8 | IF working_name = 'UNKNOWN' THEN | 99404 | 63406 | 97 41246 | 9 | RETURN null; | 10857 | 2744 | 15 41246 | 10 | END IF; | 0 | 0 | 0 41246 | 11 | | 0 | 0 | 0 41246 | 12 | working_name := replace(working_name, 'INTL', 'INTERNATIONAL'); | 88547 | 116474 | 145 41246 | 13 | working_name := replace(working_name, 'ARPT', 'AIRPORT'); | 88547 | 83015 | 91 41246 | 14 | working_name := replace(working_name, 'MUNI', 'MUNICIPAL'); | 88547 | 70676 | 74 41246 | 15 | working_name := replace(working_name, 'METRO', 'METROPOLITAN'); | 88547 | 67392 | 63 41246 | 16 | working_name := replace(working_name, 'NATL', 'NATIONAL'); | 88547 | 64681 | 70 41246 | 17 | | 0 | 0 | 0 41246 | 18 | working_name := replace(working_name, '-', ' '); | 88547 | 66771 | 62 41246 | 19 | working_name := replace(working_name, '/', ' '); | 88547 | 65054 | 66 41246 | 20 | working_name := working_name || '%'; | 88547 | 64892 | 207 41246 | 21 | | 0 | 0 | 0 41246 | 22 | code := get_iata_code_from_name(working_name); | 88547 | 12282997 | 3709 41246 | 23 | | 0 | 0 | 0 41246 | 24 | RETURN code; | 88547 | 33374 | 14 41246 | 25 | END; | 0 | 0 | 0 41247 | 1 | | 0 | 0 | 0 41247 | 2 | DECLARE | 0 | 0 | 0 41247 | 3 | working_name varchar; | 0 | 0 | 0 41247 | 4 | code varchar := null; | 0 | 0 | 0 41247 | 5 | BEGIN | 0 | 0 | 0 41247 | 6 | working_name := upper(airport_name); | 88547 | 170273 | 90 41247 | 7 | | 0 | 0 | 0 41247 | 8 | EXECUTE $__$ SELECT iata_code | 88547 | 11572604 | 3273 41247 | 9 | FROM airports | 0 | 0 | 0 41247 | 10 | WHERE upper(name) LIKE $1 | 0 | 0 | 0 41247 | 11 | $__$ | 0 | 0 | 0 41247 | 12 | INTO code | 0 | 0 | 0 41247 | 13 | USING working_name; | 0 | 0 | 0 41247 | 14 | | 0 | 0 | 0 41247 | 15 | RETURN code; | 88547 | 121574 | 27 41247 | 16 | END; | 0 | 0 | 0(41 rows)
Profiler
https://bitbucket.org/openscg/plprofiler
● Be careful when you have a function call another function– May lead to difficult to diagnose
performance problems
● Be careful when a function is used in a WHERE clause– For sequential scans, it may
execute once per row in the table
functions=# SELECT iso_region FROM airports LIMIT 5; iso_region ------------ US-PA US-AK US-AL US-AR US-AZ(5 rows)
Source: http://ourairports.com/data/
Sample Data
CREATE TYPE airport_regions AS (airport_name varchar, airport_continent varchar, airport_country varchar, airport_state varchar);
CREATE OR REPLACE FUNCTION get_airport_regions() RETURNS SETOF airport_regions AS$$BEGIN RETURN QUERY SELECT name::varchar, continent::varchar, iso_country::varchar, split_part(iso_region, '-', 2)::varchar FROM airports;END;$$ LANGUAGE plpgsql;
Set Returning Functions
functions=# SELECT b.num_wildlife_struck FROM bird_strikes b, state_code s, get_airport_regions() r WHERE b.origin_state = s.name AND s.abbreviation = r.airport_state AND r.airport_continent = 'NA'; num_wildlife_struck ---------------------…Time: 48507.635 ms
QUERY PLAN ----------------------------------------------------------------------------------------------------------- Nested Loop (cost=42.10..318.77 rows=1972 width=2) (actual time=43.468..38467.229 rows=60334427 loops=1) -> Hash Join (cost=12.81..14.51 rows=1 width=9) (actual time=43.284..58.007 rows=21488 loops=1) Hash Cond: ((s.abbreviation)::text = (r.airport_state)::text) -> Seq Scan on state_code s (cost=0.00..1.50 rows=50 width=12) (actual time=0.007..0.045 rows=50 loops=1) -> Hash (cost=12.75..12.75 rows=5 width=32) (actual time=43.264..43.264 rows=25056 loops=1) Buckets: 1024 Batches: 1 Memory Usage: 857kB -> Function Scan on get_airport_regions r (cost=0.25..12.75 rows=5 width=32) (actual time=34.050..39.650 rows=25056 loops=1) Filter: ((airport_continent)::text = 'NA'::text) Rows Removed by Filter: 21150 -> Bitmap Heap Scan on bird_strikes b (cost=29.29..288.48 rows=1578 width=10) (actual time=0.445..1.343 rows=2808 loops=21488) Recheck Cond: ((origin_state)::text = (s.name)::text) Heap Blocks: exact=31639334 -> Bitmap Index Scan on bird_strikes_state (cost=0.00..28.89 rows=1578 width=0) (actual time=0.285..0.285 rows=2808 loops=21488) Index Cond: ((origin_state)::text = (s.name)::text) Planning time: 0.742 ms Execution time: 40447.925 ms(16 rows)
Time: 40449.209 ms
CREATE OR REPLACE FUNCTION get_airport_regions() RETURNS SETOF airport_regions AS$$BEGIN RETURN QUERY SELECT name::varchar, continent::varchar, iso_country::varchar, split_part(iso_region, '-', 2)::varchar FROM airports;END;$$ LANGUAGE plpgsqlROWS 46206COST 600000;
QUERY PLAN -------------------------------------------------------------------------------------------------------------- Hash Join (cost=2081.87..7687.83 rows=91120 width=2) (actual time=51.589..7568.729 rows=60334427 loops=1) Hash Cond: ((b.origin_state)::text = (s.name)::text) -> Seq Scan on bird_strikes b (cost=0.00..4318.04 rows=99404 width=10) (actual time=0.006..14.207 rows=99404 loops=1) -> Hash (cost=2081.15..2081.15 rows=58 width=9) (actual time=51.571..51.571 rows=21488 loops=1) Buckets: 1024 Batches: 1 Memory Usage: 861kB -> Hash Join (cost=1502.12..2081.15 rows=58 width=9) (actual time=37.574..48.385 rows=21488 loops=1) Hash Cond: ((r.airport_state)::text = (s.abbreviation)::text) -> Function Scan on get_airport_regions r (cost=1500.00..2077.57 rows=231 width=32) (actual time=37.526..42.626 rows=25056 loops=1) Filter: ((airport_continent)::text = 'NA'::text) Rows Removed by Filter: 21150 -> Hash (cost=1.50..1.50 rows=50 width=12) (actual time=0.041..0.041 rows=50 loops=1) Buckets: 1024 Batches: 1 Memory Usage: 3kB -> Seq Scan on state_code s (cost=0.00..1.50 rows=50 width=12) (actual time=0.004..0.020 rows=50 loops=1) Planning time: 0.722 ms Execution time: 9572.353 ms(15 rows)
Time: 9573.716 ms
● When using set returning functions as tables, the row and cost estimates are usually way off– Default ROWS: 1000
– Default COST: 100● Note: COST is in units of
cpu_operator_cost which is 0.0025
● Do not use functions to mask a bad data model
● Use functions to help load the data into the correct format
Table Partitioning
● Usually done for performance● Uses check constraints and inherited tables● Triggers are preferred over rules so COPY can be used● Trigger functions used to move the data to the correct
child table
CREATE UNLOGGED TABLE trigger_test (key serial primary key, value varchar, insert_ts timestamp, update_ts timestamp);
CREATE UNLOGGED TABLE trigger_test_0 (CHECK ( key % 5 = 0)) INHERITS (trigger_test);CREATE UNLOGGED TABLE trigger_test_1 (CHECK ( key % 5 = 1)) INHERITS (trigger_test);CREATE UNLOGGED TABLE trigger_test_2 (CHECK ( key % 5 = 2)) INHERITS (trigger_test);CREATE UNLOGGED TABLE trigger_test_3 (CHECK ( key % 5 = 3)) INHERITS (trigger_test);CREATE UNLOGGED TABLE trigger_test_4 (CHECK ( key % 5 = 4)) INHERITS (trigger_test);
CREATE OR REPLACE FUNCTION partition_trigger() RETURNS trigger AS $$DECLARE partition int;BEGIN partition = NEW.key % 5;
EXECUTE 'INSERT INTO trigger_test_' || partition || ' VALUES (($1).*)' USING NEW;
RETURN NULL; END;$$ LANGUAGE plpgsql;
CREATE TRIGGER partition_trigger BEFORE INSERT ON trigger_test FOR EACH ROW EXECUTE PROCEDURE partition_trigger();
Dynamic Trigger
CREATE OR REPLACE FUNCTION partition_trigger() RETURNS trigger AS $$BEGIN CASE NEW.key % 5 WHEN 0 THEN INSERT INTO trigger_test_0 VALUES (NEW.*); WHEN 1 THEN INSERT INTO trigger_test_1 VALUES (NEW.*); WHEN 2 THEN INSERT INTO trigger_test_2 VALUES (NEW.*); WHEN 3 THEN INSERT INTO trigger_test_3 VALUES (NEW.*); WHEN 4 THEN INSERT INTO trigger_test_4 VALUES (NEW.*); END CASE; RETURN NULL; END;$$ LANGUAGE plpgsql;
Case Statement
● 16% performance gain using CASE Statement● Tested inserting 100,000 rows
Dynamic Trigger Case Trigger3200
3400
3600
3800
4000
4200
4400
Performance of Partition Triggers
Trigger Overhead
● Triggers get executed when an event happens in the database– INSERT, UPDATE, DELETE
● Event Triggers fire on DDL– CREATE, DROP, ALTER
CREATE UNLOGGED TABLE trigger_test (
key serial primary key,
value varchar,
insert_ts timestamp,
update_ts timestamp
);
INSERTS.pgbench
INSERT INTO trigger_test (value) VALUES (‘hello’);
pgbench -n -t 100000
-f INSERTS.pgbench functions
Inserts: 5191 TPS
CREATE FUNCTION empty_trigger() RETURNS trigger AS $$
BEGIN
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE TRIGGER empty_trigger BEFORE INSERT OR UPDATE ON trigger_test
FOR EACH ROW EXECUTE PROCEDURE empty_trigger();
pgbench -n -t 100000
-f INSERTS.pgbench functions
Inserts: 4906 TPS (5.5% overhead)
Overhead of PL Languages
● PL/pgSQL● C● PL/Perl● PL/TCL● PL/Python
● PL/v8● PL/Lua● PL/R● PL/sh
PL/pgSQL
CREATE FUNCTION empty_trigger() RETURNS trigger AS $$
BEGIN
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
C#include "postgres.h"
#include "commands/trigger.h"
PG_MODULE_MAGIC;
Datum empty_c_trigger(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(empty_c_trigger);
Datum
empty_c_trigger(PG_FUNCTION_ARGS)
{
TriggerData *tg;
HeapTuple ret;
tg = (TriggerData *) (fcinfo->context);
if (TRIGGER_FIRED_BY_UPDATE(tg->tg_event))
ret = tg->tg_newtuple;
else
ret = tg->tg_trigtuple;
return PointerGetDatum(ret);
}
PL/Python
CREATE FUNCTION empty_python_trigger()
RETURNS trigger AS
$$
return
$$ LANGUAGE plpythonu;
PL/Perl
CREATE FUNCTION empty_perl_trigger()
RETURNS trigger AS
$$
return;
$$ LANGUAGE plperl;
PL/TCL
CREATE FUNCTION empty_tcl_trigger()
RETURNS trigger AS
$$
return [array get NEW]
$$ LANGUAGE pltcl;
PL/v8
CREATE FUNCTION empty_v8_trigger()
RETURNS trigger AS
$$
return NEW;
$$
LANGUAGE plv8;
PL/R
CREATE FUNCTION empty_r_trigger()
RETURNS trigger AS
$$
return(pg.tg.new)
$$ LANGUAGE plr;
PL/Lua
CREATE FUNCTION empty_lua_trigger()
RETURNS trigger AS
$$
return
$$ LANGUAGE pllua;
PL/sh
CREATE FUNCTION empty_sh_trigger()
RETURNS trigger AS
$$
#!/bin/sh
exit 0
$$ LANGUAGE plsh;
C PL/pgSQL PL/Lua PL/Python PL/Perl PL/v8 PL/TCL PL/R PL/sh0.00%
10.00%
20.00%
30.00%
40.00%
50.00%
60.00%
70.00%
80.00%
90.00%
100.00%
Percent overhead of triggers
● Think things through before adding server side code
● Performance test your functions● Don't use a procedural language
just because it's cool– Use the right tool for the job
Questions?