Sunday, June 26, 2022

PostgreSQL for Everybody (2022)

general

Charles Russell Severance


psql -h pg.pg4e.com -p 5432 -U pg4e_4f1d91fcd9 pg4e_4f1d91fcd9

\l    (list databases)

\s    (show command history)
\q    (quit psql)

\d+   (describe a table / list relations with details)
\dt   (list tables)

init

CREATE TABLE pg4e_debug (

  id SERIAL,

  query VARCHAR(4096),

  result VARCHAR(4096),

  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

  PRIMARY KEY(id)

);

select * from pg4e_debug;

select count(*) from pg4e_debug;

SELECT query, result, created_at FROM pg4e_debug;


CREATE TABLE pg4e_result (
  id SERIAL,
  link_id INTEGER UNIQUE,
  score FLOAT,
  title VARCHAR(4096),
  note VARCHAR(4096),
  debug_log VARCHAR(8192),
  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  updated_at TIMESTAMP
);


insert

create table ages (name varchar(128), age integer);

select count(*) from ages;

insert into ages (name, age) values ('Darcey', 23);

select * from ages;

INSERT INTO ages (name, age) VALUES ('Evann', 15);
INSERT INTO ages (name, age) VALUES ('Holly', 23);
INSERT INTO ages (name, age) VALUES ('Kiaran', 21);
INSERT INTO ages (name, age) VALUES ('Mehmet', 22);

create

create table automagic (id serial, name varchar(32) not null, height float not null);

Musical Track Database (CSV)



wget https://www.pg4e.com/tools/sql/library.csv
curl -O https://www.pg4e.com/tools/sql/library.csv

create table track_raw (title text, artist text, album text, count integer, rating integer, len integer);
\copy track_raw(title,artist,album,count,rating,len) from 'library.csv' with delimiter ',' CSV;
SELECT title, album FROM track_raw ORDER BY title LIMIT 3;


Cars (integer references, on delete cascade)

CREATE TABLE make (
    id SERIAL,
    name VARCHAR(128) UNIQUE,
    PRIMARY KEY(id)
);

CREATE TABLE model (
  id SERIAL,
  name VARCHAR(128),
  make_id INTEGER REFERENCES make(id) ON DELETE CASCADE,
  PRIMARY KEY(id)
);
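
The model inserts below look up make ids by name, so the make table needs rows for Chevrolet and Ferrari first; for example:

insert into make (name) values ('Chevrolet');
insert into make (name) values ('Ferrari');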

insert into model (name, make_id) values ('Silverado C15 2WD', (select id from make where name='Chevrolet'));
insert into model (name, make_id) values ('Silverado C15 Cab Chassis 2WD', (select id from make where name='Chevrolet'));
insert into model (name, make_id) values ('Silverado C15 XFE 2WD', (select id from make where name='Chevrolet'));
insert into model (name, make_id) values ('Portofino', (select id from make where name='Ferrari'));
insert into model (name, make_id) values ('Testarossa', (select id from make where name='Ferrari'));

SELECT make.name, model.name
    FROM model
    JOIN make ON model.make_id = make.id
    ORDER BY make.name LIMIT 5;

Building a many-to-many roster


CREATE TABLE student (
    id SERIAL,
    name VARCHAR(128) UNIQUE,
    PRIMARY KEY(id)
);

DROP TABLE course CASCADE;
CREATE TABLE course (
    id SERIAL,
    title VARCHAR(128) UNIQUE,
    PRIMARY KEY(id)
);

DROP TABLE roster CASCADE;
CREATE TABLE roster (
    id SERIAL,
    student_id INTEGER REFERENCES student(id) ON DELETE CASCADE,
    course_id INTEGER REFERENCES course(id) ON DELETE CASCADE,
    role INTEGER,
    UNIQUE(student_id, course_id),
    PRIMARY KEY (id)
);

roster data (name, course, role)
Parmin, si106, Instructor
Marybeth, si106, Learner
Qirui, si106, Learner
Stacey, si106, Learner
Tighan, si106, Learner
Shelbie, si110, Instructor
Andi, si110, Learner
Arron, si110, Learner
Bruin, si110, Learner
Stewarty, si110, Learner
Tamika, si206, Instructor
Aimie, si206, Learner
Jaslyn, si206, Learner
Nidba, si206, Learner
Sarahlouise, si206, Learner

\copy student(name) from 'students.csv' with delimiter ',' CSV;
insert into course (title) values ('si106'), ('si110'), ('si206');
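
The relation.csv used below holds student_id, course_id, role (1 = instructor, 0 = learner). The ids come from the SERIAL sequences, so they depend on your own database (here the student ids happen to start at 18); a quick check before building the file:

select id, name from student order by id;
select id, title from course order by id;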

18,1,1
19,1,0
20,1,0
21,1,0
22,1,0
23,2,1
24,2,0
25,2,0
26,2,0
27,2,0
28,3,1
29,3,0
30,3,0
31,3,0
32,3,0

\copy roster(student_id, course_id, role) from 'relation.csv' with delimiter ',' CSV;

SELECT student.name, course.title, roster.role
    FROM student 
    JOIN roster ON student.id = roster.student_id
    JOIN course ON roster.course_id = course.id
    ORDER BY course.title, roster.role DESC, student.name;

Intermediate SQL


Alter table
alter table pg4e_debug add neon311 integer;
SELECT neon311 FROM pg4e_debug LIMIT 1;

describe table
\d+ taxdata
  Column  |          Type          |
----------+------------------------+
 id       | integer                |
 ein      | integer                |
 name     | character varying(255) |
 year     | integer                |
 revenue  | bigint                 |
 expenses | bigint                 |
 purpose  | text                   |
 ptid     | character varying(255) |
 ptname   | character varying(255) |
 city     | character varying(255) |
 state    | character varying(255) |
 url      | character varying(255) |

distinct
select distinct state from taxdata order by state limit 5;

stored procedure

create table
CREATE TABLE keyvalue ( 
  id SERIAL,
  key VARCHAR(128) UNIQUE,
  value VARCHAR(128) UNIQUE,
  created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  PRIMARY KEY(id)
);

create function
create or replace function trigger_set_timestamp()
returns trigger as $$
begin
  NEW.updated_at = NOW();
  return NEW;
end
$$ language plpgsql;

create trigger
create trigger set_timestamp
before update on keyvalue
for each row
execute procedure trigger_set_timestamp();

insert
INSERT INTO keyvalue (key, value) VALUES ('apple', '1') RETURNING *;
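
To see the trigger fire, update the row and check that updated_at moves forward while created_at stays put; for example:

UPDATE keyvalue SET value = '2' WHERE key = 'apple' RETURNING *;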

Musical Tracks Many-to-One


create table
CREATE TABLE album (
  id SERIAL,
  title VARCHAR(128) UNIQUE,
  PRIMARY KEY(id)
);
CREATE TABLE track (
    id SERIAL,
    title VARCHAR(128),
    len INTEGER, rating INTEGER, count INTEGER,
    album_id INTEGER REFERENCES album(id) ON DELETE CASCADE,
    UNIQUE(title, album_id),
    PRIMARY KEY(id)
);
DROP TABLE IF EXISTS track_raw;
CREATE TABLE track_raw
 (title TEXT, artist TEXT, album TEXT, album_id INTEGER,
  count INTEGER, rating INTEGER, len INTEGER);

csv
\copy track_raw(title, artist, album, count, rating, len) from 'library.csv' with delimiter ',' CSV;

distinct albums into the album table
insert into album (title) select distinct album from track_raw order by album;

update album_id
UPDATE track_raw SET album_id = (SELECT album.id FROM album WHERE album.title = track_raw.album);

insert into track
insert into track (title, len, rating, count, album_id) select title, len, rating, count, album_id from track_raw;
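
A quick sanity check that the normalized join works (a sketch):

SELECT track.title, album.title
  FROM track
  JOIN album ON track.album_id = album.id
  ORDER BY track.title LIMIT 3;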

Unesco Heritage Sites Many-to-One

psql -h pg.pg4e.com -p 5432 -U pg4e_ecb4afa439 pg4e_ecb4afa439

create raw table (unnormalized, from csv)

DROP TABLE unesco_raw;
CREATE TABLE unesco_raw
 (name TEXT, description TEXT, justification TEXT, year INTEGER,
    longitude FLOAT, latitude FLOAT, area_hectares FLOAT,
    category TEXT, category_id INTEGER, state TEXT, state_id INTEGER,
    region TEXT, region_id INTEGER, iso TEXT, iso_id INTEGER);

\copy unesco_raw(name,description,justification,year,longitude,latitude,area_hectares,category,state,region,iso) FROM 'whc-sites-2018-small.csv' WITH DELIMITER ',' CSV HEADER;

lookup tables

CREATE TABLE category (
  id SERIAL,
  name VARCHAR(128) UNIQUE,
  PRIMARY KEY(id)
);

CREATE TABLE iso (
  id SERIAL,
  name VARCHAR(128) UNIQUE,
  PRIMARY KEY(id)
);

CREATE TABLE state (
  id SERIAL,
  name VARCHAR(128) UNIQUE,
  PRIMARY KEY(id)
);

CREATE TABLE region (
  id SERIAL,
  name VARCHAR(128) UNIQUE,
  PRIMARY KEY(id)
);

insert into category (name) select distinct category from unesco_raw order by category;
insert into iso (name) select distinct iso from unesco_raw order by iso;
insert into state (name) select distinct state from unesco_raw order by state;
insert into region (name) select distinct region from unesco_raw order by region;

add id to raw table

UPDATE unesco_raw SET category_id = (SELECT category.id FROM category WHERE category.name = unesco_raw.category);
UPDATE unesco_raw SET iso_id = (SELECT iso.id FROM iso WHERE iso.name = unesco_raw.iso);
UPDATE unesco_raw SET state_id = (SELECT state.id FROM state WHERE state.name = unesco_raw.state);
UPDATE unesco_raw SET region_id = (SELECT region.id FROM region WHERE region.name = unesco_raw.region);

create the normalized table (drop the text columns and rename the table)

ALTER TABLE unesco_raw DROP COLUMN category;
ALTER TABLE unesco_raw DROP COLUMN iso;
ALTER TABLE unesco_raw DROP COLUMN state;
ALTER TABLE unesco_raw DROP COLUMN region;
ALTER TABLE unesco_raw RENAME TO unesco;
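
A sanity check of the normalized schema, joining the lookup tables back in (a sketch):

SELECT unesco.name, year, category.name, state.name, region.name, iso.name
  FROM unesco
  JOIN category ON unesco.category_id = category.id
  JOIN iso ON unesco.iso_id = iso.id
  JOIN state ON unesco.state_id = state.id
  JOIN region ON unesco.region_id = region.id
  ORDER BY iso.name, unesco.name
  LIMIT 3;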

Musical Track Database plus Artists


DROP TABLE album CASCADE;
CREATE TABLE album (
    id SERIAL,
    title VARCHAR(128) UNIQUE,
    PRIMARY KEY(id)
);

DROP TABLE track CASCADE;
CREATE TABLE track (
    id SERIAL,
    title TEXT, 
    artist TEXT, 
    album TEXT, 
    album_id INTEGER REFERENCES album(id) ON DELETE CASCADE,
    count INTEGER, 
    rating INTEGER, 
    len INTEGER,
    PRIMARY KEY(id)
);

DROP TABLE artist CASCADE;
CREATE TABLE artist (
    id SERIAL,
    name VARCHAR(128) UNIQUE,
    PRIMARY KEY(id)
);

DROP TABLE tracktoartist CASCADE;
CREATE TABLE tracktoartist (
    id SERIAL,
    track VARCHAR(128),
    track_id INTEGER REFERENCES track(id) ON DELETE CASCADE,
    artist VARCHAR(128),
    artist_id INTEGER REFERENCES artist(id) ON DELETE CASCADE,
    PRIMARY KEY(id)
);

\copy track(title,artist,album,count,rating,len) FROM 'library.csv' WITH DELIMITER ',' CSV;

INSERT INTO album (title) SELECT DISTINCT album FROM track;
UPDATE track SET album_id = (SELECT album.id FROM album WHERE album.title = track.album);


INSERT INTO tracktoartist (track, artist) SELECT DISTINCT title, artist from track;

INSERT INTO artist (name) SELECT DISTINCT artist FROM track;

UPDATE tracktoartist SET track_id = (select track.id from track where track.title = tracktoartist.track);

UPDATE tracktoartist SET artist_id = (select artist.id from artist where artist.name = tracktoartist.artist);

ALTER TABLE track DROP COLUMN album;
ALTER TABLE track DROP COLUMN artist;
ALTER TABLE tracktoartist DROP COLUMN track;
ALTER TABLE tracktoartist DROP COLUMN artist;

SELECT track.title, album.title, artist.name
FROM track
JOIN album ON track.album_id = album.id
JOIN tracktoartist ON track.id = tracktoartist.track_id
JOIN artist ON tracktoartist.artist_id = artist.id
LIMIT 3;

Generating Text

expected output
This is record number 100000 of quite a few text records.
This is record number 100001 of quite a few text records.
...
This is record number 199998 of quite a few text records.
This is record number 199999 of quite a few text records.

use generate_series(100000,199999) i to produce the numbers
create table bigtext (content text);
insert into bigtext (content)
select CONCAT('This is record number ', i, ' of quite a few text records.') from generate_series(100000,199999) i;
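
A quick count confirms the 100,000 generated rows:

select count(*) from bigtext;
-- 100000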

Regular Expression Queries

looking for lines where the very first character is an upper case letter
SELECT purpose FROM taxdata WHERE purpose ~ '^[A-Z]' ORDER BY purpose DESC LIMIT 3;

Lines that are all upper case letters and spaces and nothing else
SELECT purpose FROM taxdata WHERE purpose ~ '^[A-Z\s]+$' ORDER BY purpose DESC LIMIT 3;
SELECT purpose FROM taxdata WHERE purpose ~ '^[A-Z[:space:]]+$' ORDER BY purpose DESC LIMIT 3;
SELECT purpose FROM taxdata WHERE purpose ~ '^[[:upper:][:space:]]+$' ORDER BY purpose DESC LIMIT 3;

 YOUTH WRITING WORKSHOPS ACADEMIC SUPPORT 
 YOUTH WRESTLING CLUB SUPPORTED YOUTH WRESTLING
 YOUTH WORK THERAPY PROGRAMS

Regular Expressions


For 'hello'
the greedy h.+l matches 'hell', longest possible string.
the lazy h.+?l matches 'hel', shortest possible string.
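
A quick check in psql shows the difference (a sketch using substring, which returns the text matched by the pattern):

SELECT substring('hello' FROM 'h.+l');    -- hell  (greedy: longest match)
SELECT substring('hello' FROM 'h.+?l');   -- hel   (lazy: shortest match)

Note that in PostgreSQL's regular expression engine, mixing greedy and lazy quantifiers in one pattern can be surprising: roughly speaking, the first quantifier's preference sets the length preference for the whole match.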


Reverse Index in SQL


https://www.pg4e.com/lectures/05-FullText.txt


drop table if exists docs01 cascade;
CREATE TABLE docs01 (id SERIAL, doc TEXT, PRIMARY KEY(id));
INSERT INTO docs01 (doc) VALUES
('Type help copyright credits or license for more'),
('The  prompt is the Python interpreters way of asking'),
('you What do you want me to do next Python is ready to have a'),
('conversation with you All you have to know is how to speak the Python'),
('Lets say for example that you did not know even the simplest Python'),
('language words or sentences You might want to use the standard line'),
('that astronauts use when they land on a faraway planet and try to speak'),
('with the inhabitants of the planet'),
('I come in peace please take me to your leader'),
('This is not going so well Unless you think of something quickly the');
select * from docs01;

drop table if exists invert01;
CREATE TABLE invert01 (
  keyword TEXT,
  doc_id INTEGER REFERENCES docs01(id) ON DELETE CASCADE
);


INSERT INTO invert01 (doc_id, keyword)
SELECT DISTINCT id, s.keyword AS keyword
FROM docs01, unnest(string_to_array(lower(docs01.doc), ' ')) s(keyword)
ORDER BY id;

SELECT keyword, doc_id FROM invert01 ORDER BY keyword, doc_id LIMIT 10;
   keyword    | doc_id 
--------------+--------
              |      2
 a            |      3
 a            |      7
 all          |      4
 and          |      7
 asking       |      2
 astronauts   |      7
 come         |      9
 conversation |      4
 copyright    |      1
(10 rows)


Reverse Index (with stop words) in SQL


drop table if exists docs02 cascade;
CREATE TABLE docs02 (id SERIAL, doc TEXT, PRIMARY KEY(id));
INSERT INTO docs02 (doc) VALUES
('Type help copyright credits or license for more'),
('The  prompt is the Python interpreters way of asking'),
('you What do you want me to do next Python is ready to have a'),
('conversation with you All you have to know is how to speak the Python'),
('Lets say for example that you did not know even the simplest Python'),
('language words or sentences You might want to use the standard line'),
('that astronauts use when they land on a faraway planet and try to speak'),
('with the inhabitants of the planet'),
('I come in peace please take me to your leader'),
('This is not going so well Unless you think of something quickly the');
select * from docs02;

CREATE TABLE stop_words (word TEXT unique);
INSERT INTO stop_words (word) VALUES 
('i'), ('a'), ('about'), ('an'), ('are'), ('as'), ('at'), ('be'), 
('by'), ('com'), ('for'), ('from'), ('how'), ('in'), ('is'), ('it'), ('of'), 
('on'), ('or'), ('that'), ('the'), ('this'), ('to'), ('was'), ('what'), 
('when'), ('where'), ('who'), ('will'), ('with');

drop table if exists invert02;
CREATE TABLE invert02 (
  keyword TEXT,
  doc_id INTEGER REFERENCES docs02(id) ON DELETE CASCADE
);
INSERT INTO invert02 (doc_id, keyword)
SELECT DISTINCT id, s.keyword AS keyword
FROM docs02, unnest(string_to_array(lower(docs02.doc), ' ')) s(keyword)
where s.keyword not in (select * from stop_words)
ORDER BY id;

SELECT keyword, doc_id FROM invert02 ORDER BY keyword, doc_id LIMIT 10;
   keyword    | doc_id 
--------------+--------
              |      2
 all          |      4
 and          |      7
 asking       |      2
 astronauts   |      7
 come         |      9
 conversation |      4
 copyright    |      1
 credits      |      1
 did          |      5
(10 rows)

String Array GIN Index

-- create table
drop table if exists docs03 cascade;
CREATE TABLE docs03 (id SERIAL, doc TEXT, PRIMARY KEY(id));
INSERT INTO docs03 (doc) VALUES
('Type help copyright credits or license for more'),
('The  prompt is the Python interpreters way of asking'),
('you What do you want me to do next Python is ready to have a'),
('conversation with you All you have to know is how to speak the Python'),
('Lets say for example that you did not know even the simplest Python'),
('language words or sentences You might want to use the standard line'),
('that astronauts use when they land on a faraway planet and try to speak'),
('with the inhabitants of the planet'),
('I come in peace please take me to your leader'),
('This is not going so well Unless you think of something quickly the');

-- seq scan
EXPLAIN SELECT id, doc FROM docs03 WHERE '{conversation}' <@ string_to_array(lower(doc), ' ');
                                   QUERY PLAN                                   
--------------------------------------------------------------------------------
 Seq Scan on docs03  (cost=0.00..32.23 rows=6 width=36)
   Filter: ('{conversation}'::text[] <@ string_to_array(lower(doc), ' '::text))
(2 rows)


-- create index
CREATE INDEX array03 ON docs03 USING gin(string_to_array(lower(doc), ' ')  array_ops);


-- filler rows to make sure PostgreSQL uses the index
INSERT INTO docs03 (doc) SELECT 'Neon ' || generate_series(10000,20000);

-- bitmap heap scan
EXPLAIN SELECT id, doc FROM docs03 WHERE '{conversation}' <@ string_to_array(lower(doc), ' ');
                                        QUERY PLAN                                        
------------------------------------------------------------------------------------------
 Bitmap Heap Scan on docs03  (cost=12.01..16.03 rows=1 width=15)
   Recheck Cond: ('{conversation}'::text[] <@ string_to_array(lower(doc), ' '::text))
   ->  Bitmap Index Scan on array03  (cost=0.00..12.01 rows=1 width=0)
         Index Cond: ('{conversation}'::text[] <@ string_to_array(lower(doc), ' '::text))
(4 rows)
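
Dropping the EXPLAIN runs the query itself; with this data it returns the one row containing 'conversation':

SELECT id, doc FROM docs03 WHERE '{conversation}' <@ string_to_array(lower(doc), ' ');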

GIN tsvector Index

-- create index
CREATE INDEX fulltext03 ON docs03 USING gin(to_tsvector('english', doc));

-- filler rows to make sure PostgreSQL uses the index
INSERT INTO docs03 (doc) SELECT 'Neon ' || generate_series(10000,20000);

-- bitmap heap scan
EXPLAIN SELECT id, doc FROM docs03 WHERE to_tsquery('english', 'instructions') @@ to_tsvector('english', doc);
                                       QUERY PLAN                                        
-----------------------------------------------------------------------------------------
 Bitmap Heap Scan on docs03  (cost=12.39..81.75 rows=50 width=15)
   Recheck Cond: ('''instruct'''::tsquery @@ to_tsvector('english'::regconfig, doc))
   ->  Bitmap Index Scan on fulltext03  (cost=0.00..12.38 rows=50 width=0)
         Index Cond: ('''instruct'''::tsquery @@ to_tsvector('english'::regconfig, doc))
(4 rows)
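
The EXPLAIN above searches for 'instructions', which does not occur in these documents; a term that does, such as 'conversation', returns a row (a quick check):

SELECT id, doc FROM docs03 WHERE to_tsquery('english', 'conversation') @@ to_tsvector('english', doc);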

Making a connection from Python


pip install psycopg2-binary

simple.py


# install psycopg2 (if needed)
# pip3 install psycopg2    # (or pip)

# https://www.pg4e.com/code/simple.py

# https://www.pg4e.com/code/hidden-dist.py
# copy hidden-dist.py to hidden.py
# edit hidden.py and put in your credentials

# python3 simple.py

# To check the results, use psql and look at the
# pythonfun table

import psycopg2
import hidden

# Load the secrets
secrets = hidden.secrets()

conn = psycopg2.connect(host=secrets['host'],
        port=secrets['port'],
        database=secrets['database'], 
        user=secrets['user'], 
        password=secrets['pass'], 
        connect_timeout=3)

cur = conn.cursor()

sql = 'DROP TABLE IF EXISTS pythonfun CASCADE;'
print(sql)
cur.execute(sql)

sql = 'CREATE TABLE pythonfun (id SERIAL, line TEXT);'
print(sql)
cur.execute(sql)

conn.commit()    # Flush it all to the DB server

for i in range(10) : 
    txt = "Have a nice day "+str(i)
    sql = 'INSERT INTO pythonfun (line) VALUES (%s);'
    cur.execute(sql, (txt, ))

conn.commit()

sql = "SELECT id, line FROM pythonfun WHERE id=5;" 
print(sql)
cur.execute(sql)

row = cur.fetchone()
if row is None : 
    print('Row not found')
else:
    print('Found', row)

sql = 'INSERT INTO pythonfun (line) VALUES (%s) RETURNING id;'
cur.execute(sql, (txt, ))
id = cur.fetchone()[0]
print('New id', id)

# Let's make a mistake
sql = "SELECT line FROM pythonfun WHERE mistake=5;"
print(sql)
cur.execute(sql)

conn.commit()
cur.close()

hidden.py

# Keep this file separate

# https://www.pg4e.com/code/hidden-dist.py

# psql -h pg.pg4e.com -p 5432 -U pg4e_be9e729093 pg4e_be9e729093

# %load_ext sql
# %config SqlMagic.autocommit=False
# %sql postgresql://pg4e_be9e729093:pg4e_p_d5fab7440699124@pg.pg4e.com:5432/pg4e_be9e729093
# %sql SELECT 1 as "Test"

def secrets():
    return {"host": "pg.pg4e.com",
            "port": 5432,
            "database": "pg4e_026ee23427",
            "user": "pg4e_026ee23427",
            "pass": "pg4e_p_7a84820f4ce3880"}

def elastic() :
    return {"host": "www.pg4e.com",
            "prefix" : "elasticsearch",
            "port": 443,
            "scheme": "https",
            "user": "pg4e_86f9be92a2",
            "pass": "2008_9d454b1f"}

def readonly():
    return {"host": "pg.pg4e.com",
            "port": 5432,
            "database": "readonly",
            "user": "readonly",
            "pass": "readonly_password"}

# Return a psycopg2 connection string

# import hidden
# secrets = hidden.readonly()
# sql_string = hidden.psycopg2(hidden.readonly())

# 'dbname=pg4e_data user=pg4e_data_read password=pg4e_p_d5fab7440699124 host=pg.pg4e.com port=5432'

def psycopg2(secrets) :
     return ('dbname='+secrets['database']+' user='+secrets['user']+
        ' password='+secrets['pass']+' host='+secrets['host']+
        ' port='+str(secrets['port']))

# Return an SQLAlchemy string

# import hidden
# secrets = hidden.readonly()
# sql_string = hidden.alchemy(hidden.readonly())

# postgresql://pg4e_data_read:pg4e_p_d5fab7440699124@pg.pg4e.com:5432/pg4e_data

def alchemy(secrets) :
    return ('postgresql://'+secrets['user']+':'+secrets['pass']+'@'+secrets['host']+
        ':'+str(secrets['port'])+'/'+secrets['database'])

A Sequence of Numbers

import psycopg2
import hidden

# Load the secrets
secrets = hidden.secrets()

conn = psycopg2.connect(host=secrets['host'],
        port=secrets['port'],
        database=secrets['database'], 
        user=secrets['user'], 
        password=secrets['pass'], 
        connect_timeout=3)

cur = conn.cursor()

sql = 'DROP TABLE IF EXISTS pythonseq CASCADE;'
print(sql)
cur.execute(sql)

sql = 'CREATE TABLE pythonseq (iter INTEGER, val INTEGER);'
print(sql)
cur.execute(sql)

conn.commit()    # Flush it all to the DB server

number = 242412
for i in range(300) :
    print(i+1, number)
    sql = 'INSERT INTO pythonseq (iter, val) VALUES (%s, %s);'
    cur.execute(sql, (i+1, number,))
    number = int((number * 22) / 7) % 1000000

conn.commit()
cur.close()
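
After the script finishes, a quick look at the last few values from psql (a sketch):

SELECT iter, val FROM pythonseq ORDER BY iter DESC LIMIT 5;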


Interacting with the PokéAPI

# Assumed imports and setup (the snippet as posted relied on these being defined elsewhere)
import json

import psycopg2
import requests

import hidden

secrets = hidden.secrets()

conn = psycopg2.connect(host=secrets['host'],
                        port=secrets['port'],
                        database=secrets['database'],
                        user=secrets['user'],
                        password=secrets['pass'],
                        connect_timeout=3)
cur = conn.cursor()
cur.execute("CREATE TABLE IF NOT EXISTS pokeapi (id INTEGER, body JSONB);")

# The list endpoint returns {"results": [{"name": ..., "url": ...}, ...]}
url = "https://pokeapi.co/api/v2/pokemon"
response = requests.get(url)
js = json.loads(response.text)
results = js.get("results", [])

# For each pokemon in the list, fetch its detail document and store it as JSONB
# (assumed approach; the original snippet left id and body undefined)
sql = "INSERT INTO pokeapi (id, body) VALUES ( %s, %s );"
for pokemon in results:
    body = requests.get(pokemon["url"]).json()
    cur.execute(sql, (body["id"], json.dumps(body),))
conn.commit()
cur.close()
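
To confirm the rows landed, a quick check from psql (assuming each stored body is a pokemon detail document with a top-level name field, as in the sketch above):

SELECT id, body->>'name' AS name FROM pokeapi ORDER BY id LIMIT 5;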

Course 4 Database Architecture, Scale, and NoSQL

week 1 question

========
To Multi-Master or not to Multi-Master - that is the question
Your organization is reaching the limits of its relational database application as it scales, and you are in a meeting to decide whether or not to move to a multi-master approach. Expanding the size of the single-master instance is quite expensive and requires a long-term contract. The multi-master approach is much less expensive and can use commodity hardware. Take a position for or against the switch to the multi-master proposal. Write 3-5 paragraphs and share your reasoning on the position you have taken. Feel free to bring prior experience that you might have into the conversation. Feel free to know more than the instructor on this topic :) Let's see what we can learn from talking and listening to each other's perspectives. The primary goal is to have a conversation.

Don't take off points for little mistakes. If they seem to have done the assignment give them full credit. Feel free to make suggestions if there are small mistakes. Please keep your comments positive and useful. If you do not take grading seriously, the instructors may delete your response and you will lose points.

answers
=======

I am for the switch to the multi-master proposal. For long-term project development and business sustainability, the data size would reach the single-machine limit of that single master sooner or later. Horizontal scaling should be taken into account.

Expanding the size of the current single master may solve the problem in the short term, which means a shorter lead time to solve our problem. However, it requires a higher hardware cost. And we are in an age of high-speed hardware development, where today's highest-end computer could possibly be replaced by a tiny micro computing device in the future. In other words, the high-end hardware we purchase today could depreciate drastically in the coming years. This has happened in the recent past, where high-end mainframe computational power was surpassed by a handheld smartphone.

Switching from single-master to multi-master usually requires a slightly more complex data flow design. For example, we need to figure out whether there is any part of the data that does not require strong consistency but only eventual consistency. In this way the multi-master approach may not require high-end machines. Google did this many years ago by returning search results from thousands of pieces of commodity hardware within a fraction of a second. We should be able to follow the same approach if we design the data flow correctly.

Peer 1
======

The advantages of a multi-master concept are higher availability and potentially faster server responses. This would allow us to continue operating our database for the foreseeable future.

On the other hand, this approach brings a number of disadvantages. By having multiple servers that can perform write operations, we get increasing system complexity and communication latency. This limits the scalability of this approach. Another disadvantage is that we lose consistency in our database. What does this mean for our application?

Another approach would be to try to use read-only replicas first. This might be sufficient for the near future and is easy to implement. This would allow us to avoid the complexity of a multi-master approach. In the long term, we can move to a BASE type database. The biggest change here is that we lose consistency, but we lose that anyway.

Peer 2
======

As I learned from the lecture, it is hard to be absolutely for or against Multi-Master. It should depend on:
1) what kind of data we are going to store
2) whether the data size increases continuously
3) the tolerance trade-off between budget and stability/consistency

Once paid, problem solved
Expanding the size of the single-master instance is quite expensive, but if the data size stays stable and does not increase in the future, it is a good solution to keep consistency and stability.

Permanent solution but more work involved
If we are facing a continuous increase in data size, then we need to consider another, permanent solution -- multi-master. It is cheaper, but it increases the risk of over-communication and overloading between masters. Maybe we could use an engine which manages the resource usage/communication/transactions/jobs between the different masters, like Spark for HDFS, to make the database more performant.


Peer 3
======
The only tenable decision is to switch to a multi-master implementation. If your organization really is reaching the limits of its relational database application, then that alone means a scalable solution must be chosen. In the real world, the fact that expanding the size of the single-master instance is quite expensive would likely be a decisive factor. The obligation of a long-term contract per size expansion (and, assuming a successful business, expansion is likely) is a non-mutually-exclusive additional reason to choose to change to a multi-master.

The lower cost of the multi-master change is also a telling factor in the decision. The fact that commodity hardware (presumably hardware that already exists and is commonly in use for databases) can be used is an additional non-mutually-exclusive reason to choose to go to multi-master.

If the stated assumptions prove to be incorrect, then it may be necessary to continue to use the single-master application. If in reality it turns out that you can't go to multi-master, then to stay in business you have to go back to the more expensive alternative.

Peer 4
======
A Multi-Master setup has multiple nodes of a database cluster accepting the write requests. It typically depends on the client to pick a Master node, or an abstracted proxy may choose one. As soon as a Master node gets the write request, it accepts it and applies the changes. Once the update is made on one Master, it is propagated to all other Master nodes of the cluster asynchronously, making the data eventually consistent.

Peer 5
======
The decision on whether to move to a multi-master approach really depends on several factors. Some of these are listed below:

The expected growth of the demands on the proposed system over the next 5-10 years.
The projected rate of growth of the data, and of processing requirements, over the next 5-10 years.
The type of business, i.e. how vital data consistency is. A bank or financial institution would require strong adherence to ACID. A business with more static data could live with eventual consistency.
Pro - After thinking it over, and assuming that "the multi-master approach is much less expensive and can use commodity hardware" is valid, I believe that over a 10-year time period the master-master approach would be best. As a consultant, I've found that this approach is favored by large clients, for example the City and County of Denver. The use of multiple master databases and replica (slave) databases increases the efficiency of the system by dividing CRUD between the master and replica databases. In addition, entities such as governments have strong IT departments that can support this approach and have the resources to configure and deploy such a system. Once the initial system is configured and deployed, additional masters and replicas can be added as needed.

Con - The major disadvantages are the time and resources required to configure and deploy a master-master system. For smaller entities, a simple master-slave (replica) setup might be best. This approach is best for environments where the rate of growth of the demands on the system is relatively low and stable.


Week 2


On ACID and BASE Architectures
The topic for this week is to compare, contrast, and explore the interplay between ACID and BASE style architectures. In this mini-paper (3-5 paragraphs) explain your take or views on these competing approaches to database architecture. Feel free to bring prior experience that you might have into the conversation. It is quite OK to take and support a position that is different from the position in this week's lectures. Let's see what we can learn from talking and listening to each other's perspectives. The main goal is to have a conversation.

Don't take off points for little mistakes. Feel free to make suggestions if there are small mistakes. Please keep your comments positive and useful. Provide comments that will help us all think about the question that is posed. Some of the points will be given by the instructors - it might take a while for the instructor points to appear.

answer
======

The BASE model allows a database instance to spread and replicate across its cluster, so data is basically available, but it may not be immediately consistent after an update, entering a soft state. Although immediate consistency cannot be guaranteed, the data eventually becomes consistent, and it can still be read before then, which allows the highest availability.

The ACID model enforces that a transaction, which is a sequence of operations, is atomic: either all of its operations are executed or none are. Data is always consistent and always follows the constraints enforced by the schema.

The choice between the BASE and ACID models has no definite answer. For cases where an approximate result is fine and users expect low response times, BASE is usually the better option. When users expect accurate results based on consistent data, we should give an ACID database a go.

There are some examples where the BASE model may be favored. These include processing unstructured data for insight, finding patterns in data, handling data that involves individuals' hobbies or opinions, and reporting dynamic status such as mixed types of network traffic or public health statistics.

There are some examples where the ACID model may be favored. These include financial transactions, scientific research, and marking or scoring systems.


peer 1
======
I don't need any ACID or BASE database to tell that this is a direct copy-paste from https://neo4j.com/blog/acid-vs-base-consistency-models-explained/.

peer 2
======
We want to compare ACID and BASE databases to get a better understanding of their strengths and weaknesses, and when to choose which. To do this, we look at a number of selected aspects: organization of the data, data replication, and the consistency of the data. A brief conclusion follows at the end.

Organization of the data: In ACID databases, data is organized in rows and columns. This has the advantage that we can formulate strong constraints on the data and thus efficiently perform operations on the data. In contrast, in BASE databases, the data is organized into documents that contain key-value pairs (it looks like a Python dictionary). This provides greater freedom in the form of our data, at the cost of unpredictability in the runtime of operations.

Data replication: In ACID systems we try to avoid data replication. Therefore, we create lookup tables for text columns and refer to the values in these tables with integer keys. In the case of BASE databases, we accept replication of data, which is forced at a certain point by the size or scale of the application. The disadvantage is the higher demand for storage space.

Consistency: In ACID databases, great efforts are made to ensure data consistency, at the cost of having to scale vertically. In BASE systems, on the other hand, we accept a certain level of inconsistency to allow better horizontal scaling of the system.

In summary, if we assume ACID databases as the default choice, we have seen that BASE databases become superior if the number of accesses and the size of the data are large enough.

peer 3
======
I don't need any ACID or BASE database to tell that this is a direct copy-paste from https://phoenixnap.com/kb/acid-vs-base#:~:text=The%20ACID%20database%20transaction%20model,(e.g.%2C%20data%20warehousing).

peer 4
======
First off, the ACID model provides a consistent system while the BASE model provides high availability. The main principles of ACID are Atomicity, Consistency, Isolation and Durability, so we can suppose that this type of database is highly normalized. On the other hand we have BASE, which stands for Basically Available, Soft state, Eventually consistent. There's no answer as to which type is better, because each has its own pros and cons. Collections of operations that form a single logical unit of work are called transactions, and the database system must ensure proper execution of transactions; the ACID database transaction model ensures that a performed transaction is always consistent. The rise in popularity of NoSQL databases brought flexibility and fluidity, with ease of manipulating data, and as a result a new database model, BASE, was designed to reflect these properties.

peer 5
======
It is not possible to give a straight answer to the question of which database model is better. Therefore, a decision must be reached by considering all the aspects of the project.

Given their highly structured nature, ACID-compliant databases will be a better fit for those who require consistency, predictability, and reliability.

Those who consider growth to be among their priorities will likely want to choose the BASE model, because it enables easier scaling up and provides more flexibility. However, BASE also requires developers who will know how to deal with the limitations of the model.

Week 3


Assignment 1
pip install 'elasticsearch<7.14.0'



end
