//一個表有一個ID欄位,是主鍵並且是遞增的,現在需要一條SQL把這個表沒有的ID查出來
//例如ID的記錄如下:
ID
1
2
4
5
7
9
//我們SQL需要把3,6,8這三個值查出來
//這是一個典型的求集合的差集問題:
with ta as(
select 1 id from dual union all
select 2 from dual union all
select 4 from dual union all
select 5 from dual union all
select 7 from dual union all
select 9 from dual)
select level id
from dual
connect by level<=(select max(id)-min(id)+1 from ta)
minus
select id from ta
/
ID
----------
3
6
8
//對於此問題,這裡ta中資料很小量的,如果ta的最後一個元素為1000001呢?
//那麼怎麼樣的查詢才算高效呢?
//求集合的差集還有其他方法嗎?
//下面我們來看看類似情況(求連續遞增的數中沒有出現的數)的差集怎麼的:
create table t as
select 1 id from dual union all
select 2 from dual union all
select 3 from dual union all
select 4 from dual union all
select 456 from dual union all
select 10145 from dual union all
select 1044653 from dual
/
//方法一:使用minus
select count(*)
from (
select level id
from dual
connect by level<=(select max(id)-min(id)+1 from t)
minus
select id from t)
/
COUNT(*)
----------
1044646
Executed in 1.547 seconds
//方法二:使用not exists
select count(*)
from (
select *
from (
select level id
from dual
connect by level<=(select max(id)-min(id)+1 from t)) a
where not exists(
select 1
from t b
where a.id=b.id))
/
COUNT(*)
----------
1044646
Executed in 2.157 seconds
//方法三:使用not in
select count(*)
from (
select *
from (
select level id
from dual
connect by level<=(select max(id)-min(id)+1 from t)) a
where a.id not in(
select b.id
from t b
where a.id=b.id))
/
COUNT(*)
----------
1044646
Executed in 8.39 seconds
//從這裡看出,在處理大量資料時,相比於exists,not in(in)的效率是相當低的
//所以建議在應用中要盡量使用exists,少用in
//因為in要進行元素匹配,對比,而exists只需要判斷存在性即可
//方法四:使用lag()分析函數
select count(*)
from (
with temp as(
select s,e
from (
select lag(id) over(order by id)+1 s,id-1 e from t)
where e - s >= 0)
select a.s + b.rn -1 h
from temp a,
(
select rownum rn
from (select max(e-s+1) gap from temp)
connect by rownum <= gap) b
where a.s + b.rn-1 <= a.e
order by 1)
/
COUNT(*)
----------
1044646
Executed in 10.313 seconds
SQL> set time on;
7:11:37 SQL> /
COUNT(*)
----------
1044646
Executed in 10.156 seconds
7:11:50 SQL>
//此方法效率最低,因為我們這裡用到了lag()分析函數,max(),min(),以及with暫存資料表,
//在一個查詢中,使用的函數越多,就會增加Oracle的負擔,
//所以oracle engine在查理此查詢時,也需要額外的開銷(時間和資源)