Tools:PostgreSQL
Operating environment:Ubantu 14.0
Data source:tpc-h Dbgen
TASK1:
1. Download PostgreSQL (sudo install PostgreSQL), the results are as follows.
TASK2:
1. Follow the tutorial and go to the Dbgen folder to modify the makefile file.
Then execute make to use tpc-h to generate the data. Here you can see the front 5 points of http://www.cnblogs.com/joyeecheung/p/3599698.html.
In addition, Tpc-h's diagram file can be found on the official website: http://www.tpc.org/tpc_documents_current_versions/pdf/tpch2.17.1.pdf
2. Convert the TBL format file to a JSON format file.
1) code: The main use of C + + file read and write to complete, read the specified tbl file, and according to tpch official
The web describes the data model to convert, write json files, examples for converting orders.tbl files.
Of course, in order to think fast, it is directly a table conversion, you can also directly to all the files at the same time conversion operation.
#include<iostream>
#include<fstream>
#include<string>
#include<vector>
using namespace std;
vector<string> split(string s, string c) {
string::size_type pos;
vector<string> result;
int size = s.size();
for (int i = 0; i < size; i++) {
pos = s.find(c, i);
if(pos < size) {
result.push_back(s.substr(i, pos-i));
i = pos + c.size() -1;
}
}
return result;
}
int main() {
string s;
int i, j;
ifstream in;
in.open("orders.tbl");
ofstream out;
out.open("orders.json");
string title[9] = { "\"orderkey\"","\"custkey\"","\"orderstatus\"",
"\"totalprice\"","\"orderdate\"","\"orderpriority\"",
"\"clerk\"","\"shippriority\"","\"comment\""};
if (in.is_open()) {
for (i = 1; getline(in, s); i++) {
vector<string> result = split(s, "|");
/*if (i == 3270184 ){
cout << "!!!" << s << endl;
break;
}*/
s = "\t{";
if (result.size() == 9) {
out << i;
for (j = 0; j < 8; j++) {
s = s + title[j] + ":\"" + result[j] + "\",";
}
s = s + title[j] + ":\"" + result[j] + "\"}\n";
out << s;
}
}
out.close();
}
}
2) Effect: My computer is going to be blown . jpg
TASK3:
1. Create the database and the corresponding table, and import the converted JSON format file into it.
2. Start the query.
1 "learn before starting a query sql query the knowledge, especially to know its choice of table, the principle of spelling, such as select "the execution order of the statements, from
2) to figure out three data entities for this database customer,part,supplier And the relationship between them, so that you can use the appropriate method to query the results of the query you want.
Q1:display the key and quantity of parts which is shipped in March 13th, 1996.
Query Statement (SQL):
select data->>‘partkey‘ as part, data->>‘quantity‘ as quantity
from l
where data->>‘shipdate‘ = ‘1996-03-13‘;
Query Result:
Q2:for Each ship mode, the. nd the total quantity of items is shipped before
Query Statement (SQL):
select sum(cast(data->>‘quantity‘ as numeric)) as total, data->>‘shipmode‘ as shipmode
from l
where data->>‘shipdate‘ < ‘1998-12-01‘
group by data->>‘shipmode‘
order by total asc;
Query Result:
Q3:display The total quantity of parts which be satis?ed the following conditions in Lineitem:–the ship mode is air–t He priority of the parts belong to be URGENT
Query Statement (SQL):
select sum(cast(data->>‘quantity‘ as numeric)) as total
from l
where data->>‘shipmode‘=‘AIR‘ and data->>‘orderkey‘ in (select data->>‘orderkey‘ from o where data->>‘orderpriority‘=‘1-URGENT‘);
Query Result:
Q4:
Query Statement (SQL):
select data->>‘name‘ as name, data->>‘nationkey‘ as nation,
data->>‘regionkey‘ as region
from n
where data->>‘nationkey‘ in (select s.data->>‘nationkey‘ from s, ps where s.data->>‘suppkey‘=ps.data->>‘suppkey‘ and ps.data->>‘partkey‘=‘100‘ and ps.data->>‘partkey‘ < ps.data->>‘suppkey‘);
Query Result:
Q5:
Query Statement (SQL):
select count(distinct data->>‘custkey‘)
from c
where data->>‘custkey‘ in (select o.data->>‘custkey‘
from o, l
where o.data->>‘orderkey‘ = l.data->>‘orderkey‘ and l.data->>‘partkey‘ in (select data->>‘partkey‘
from p
where data->>‘type‘ like ‘%STEEL%‘));
Query Result:
Q6:
Query Statement (SQL):
(Method 1, efficiency is slow ):
select data->>‘partkey‘ as key, data->>‘name‘ as name, data->>‘brand‘ as brand,
(select sum(cast(l.data->>‘quantity‘ as numeric)) as total from l
where l.data->>‘shipdate‘ like ‘%1997%‘ and l.data->>‘partkey‘ = p.data->>‘partkey‘
group by l.data->>‘partkey‘)
from p
order by total desc limit 10;
(Method 2, Efficiency is faster):
select p.data->>‘partkey‘ as key, p.data->>‘name‘ as name, p.data->>‘brand‘ as brand, s.total
from p, (select l.data->>‘partkey‘ as partkey, sum(cast(l.data->>‘quantity‘ as numeric)) as total
from l
where l.data->>‘shipdate‘ like ‘%1997%‘
group by l.data->>‘partkey‘
order by total desc
limit 10
) as s
where p.data->>‘partkey‘ = s.partkey;
Query Result:
Method1and Methods2The main difference is the first choiceTenDo you spell the table or do you choose the line again, the method2The results can be obtained in more than 10 seconds, the method1takes a longer time. Of course, I actually have a law.0, directly theLand thePmake a spelling list and choose ... Enter the query before bedtime, woke up without a result (seven or eight head)= =.
Q7:
Query Statement (SQL):
select c.data->>‘custkey‘ as customer, c.data->>‘name‘ as name,
c.data->>‘address‘ as address, c.data->>‘phone‘ as phone,
op.count as count
from c, (select o.data->>‘custkey‘ as cust, count(*) as count
from o
group by o.data->>‘custkey‘
having count(*) >= 10) as op
where c.data->>‘custkey‘ = op.cust;
Query Result:
Q8:
Query Statement (SQL):
Method 1(inspired by Q6 , the first of the descending selection is the maximum value)
select cast(data->>‘extendedprice‘ as numeric) as maxprice
from l, (select data->>‘orderkey‘ as orderkey from o
where data->>‘orderdate‘ like ‘%1998%‘) as op
where l.data->>‘orderkey‘ = op.orderkey
order by maxprice desc
limit 1;
Method 2(normal thinking, aggregate function with sum applied)
select max(cast(lp.price as numeric)) as maxextendedprice
from (select l.data->>‘extendedprice‘ as price from l where l.data->>‘orderkey‘ in (select o.data->>‘orderkey‘
from o
where o.data->>‘orderdate‘ like ‘%1998%‘)) as lp;
Query Result:
It can be found that the results of these two methods are the same as the efficiency, the intuitive feeling is similar, if the database is sorted by heap, it is true that the efficiency of the two methods is the same as the algorithm. (in particular , it is important to note that when using sorting, the data is coerced to the numeric type, otherwise it is the text format and the result of the comparison is 9999...). )
Q9:
Query Statement (SQL):
select sum(cast(l.data->>‘quantity‘ as numeric)) as total, l.data->>‘shipmode‘ as mode
from l
where l.data->>‘orderkey‘ in (select o.data->>‘orderkey‘ from o
where o.data->>‘orderdate‘ like ‘1995-05%‘) and l.data->>‘shipmode‘ <> ‘TRUCK‘ and l.data->>‘shipmode‘ <> ‘RAIL‘
group by l.data->>‘shipmode‘;
Query Result:
Experience:
1. The installation process is completely followed by the tutorial, without encountering any problems.
2. In fact, this homework is very simple, but at the beginning I very blindly began to do homework, have not made clear the principle, encountered what attributes to convert what file and import into the database. Then the data were all erased, first understand tpc-h sql statements, with particular emphasis on the order in which statements are executed, because the group by having where
3. The query when my mode is basically: to meet the xx and xx xx. xx xx ... Find the subject before you find the table.
4.SQL Query response is slow when the data is very large ... Then listen to the classmate said to set up an index to query, just a few seconds of things ...
"Database" PostgreSQL and SQL Primary use (tpc-h data application)