[Author]: Kwu
Running SQL from a shell script with SparkSQL: the spark-sql CLI provides options similar to Hive's -e, -f, and -i.
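For reference, a minimal sketch of the three options (the inline query and the query.sql file name are illustrative, not from the job below):

    # -e: run a quoted SQL statement directly
    spark-sql -e "select count(1) from dms.tracklog_5min"

    # -f: run the statements saved in a SQL file
    spark-sql -f /opt/bin/spark_opt/query.sql

    # -i: run an initialization file (settings, UDF registration) before the session starts
    spark-sql -i /opt/bin/spark_opt/init.sql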
1. The scheduled call script
    #!/bin/sh
    # upload logs to HDFS
    yesterday=`date --date='1 days ago' +%Y%m%d`

    /opt/modules/spark/bin/spark-sql -i /opt/bin/spark_opt/init.sql \
        --master spark://10.130.2.20:7077 \
        --executor-memory 6g \
        --total-executor-cores 45 \
        --conf spark.ui.port=4075 \
        -e "
    insert overwrite table st.stock_realtime_analysis partition (dtype='01')
    select t1.stockId as stockId,
           t1.url as url,
           t1.clickcnt as clickcnt,
           0,
           -- percent change vs. yesterday; the rounding arguments were garbled in the source, (* 100, 2) assumed
           round((t1.clickcnt / (case when t2.clickcntyesday is null then 0 else t2.clickcntyesday end) - 1) * 100, 2) as lpcnt,
           '01' as type,  -- the literal was lost in the source; '01' assumed to match the partition
           t1.analysis_date as analysis_date,
           t1.analysis_time as analysis_time
    from (select stock_code stockId,
                 concat('http://stockdata.stock.hexun.com/', stock_code, '.shtml') url,
                 count(1) clickcnt,
                 substr(from_unixtime(unix_timestamp(), 'yyyy-MM-dd HH:mm:ss'), 1, 10) analysis_date,
                 substr(from_unixtime(unix_timestamp(), 'yyyy-MM-dd HH:mm:ss'), 12, 8) analysis_time
          from dms.tracklog_5min
          where stock_type = 'stock'
            and day = substr(from_unixtime(unix_timestamp(), 'yyyyMMdd'), 1, 8)
          group by stock_code
          order by clickcnt desc
          limit 20) t1  -- the limit value was lost in the source; 20 assumed
    left join (select stock_code stockId,
                      count(1) clickcntyesday
               from dms.tracklog_5min a
               where stock_type = 'stock'
                 and substr(datetime, 1, 10) = date_sub(from_unixtime(unix_timestamp(), 'yyyy-MM-dd HH:mm:ss'), 1)
                 -- compare only the HH:mm part of the timestamp; the substr offsets were garbled in the source
                 and substr(datetime, 12, 5) < substr(from_unixtime(unix_timestamp(), 'yyyy-MM-dd HH:mm:ss'), 12, 5)
                 and day = '${yesterday}'
               group by stock_code) t2
      on t1.stockId = t2.stockId;
    "

    sqoop export --connect jdbc:mysql://10.130.2.245:3306/charts \
        --username guojinlian \
        --password Abcd1234 \
        --table stock_realtime_analysis \
        --fields-terminated-by '\001' \
        --columns "stockid,url,clickcnt,splycnt,lpcnt,type" \
        --export-dir /dw/st/stock_realtime_analysis/dtype=01
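"Scheduled" here means the script is driven by cron; a sketch of a crontab entry (the script path, log path, and times are assumptions for illustration):

    # run every weekday at 15:10, appending output to a log
    10 15 * * 1-5 /opt/bin/spark_opt/stock_realtime_analysis.sh >> /opt/bin/spark_opt/stock_realtime_analysis.log 2>&1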
The contents of init.sql, which registers the UDFs:
    add jar /opt/bin/udf/hive-udf.jar;
    create temporary function udtf_stockidxfund as 'com.hexun.hive.udf.stock.UDTFStockIdxFund';
    create temporary function udf_getbfhourstime as 'com.hexun.hive.udf.time.UDFGetBfHoursTime';
    create temporary function udf_getbfhourstime2 as 'com.hexun.hive.udf.time.UDFGetBfHoursTime2';
    create temporary function udf_stockidxfund as 'com.hexun.hive.udf.stock.UDFStockIdxFund';
    create temporary function udf_md5 as 'com.hexun.hive.udf.common.HashMD5UDF';
    create temporary function udf_murhash as 'com.hexun.hive.udf.common.HashMurUDF';
    create temporary function udf_url as 'com.hexun.hive.udf.url.UDFUrl';
    create temporary function url_host as 'com.hexun.hive.udf.url.UDFHost';
    create temporary function udf_ip as 'com.hexun.hive.udf.url.UDFIP';
    create temporary function udf_site as 'com.hexun.hive.udf.url.UDFSite';
    create temporary function udf_urldecode as 'com.hexun.hive.udf.url.UDFUrlDecode';
    create temporary function udtf_url as 'com.hexun.hive.udf.url.UDTFUrl';
    create temporary function udf_ua as 'com.hexun.hive.udf.useragent.UDFUA';
    create temporary function udf_ssh as 'com.hexun.hive.udf.useragent.UDFSSH';
    create temporary function udtf_ua as 'com.hexun.hive.udf.useragent.UDTFUA';
    create temporary function udf_kw as 'com.hexun.hive.udf.url.UDFKW';
    create temporary function udf_chdecode as 'com.hexun.hive.udf.url.UDFChDecode';
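One way to confirm the registration took effect is to open a session with the same -i file and inspect the functions; this check is my addition, not from the original post:

    spark-sql -i /opt/bin/spark_opt/init.sql -e "show functions; describe function udf_md5;"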
Setting the Spark UI port
The default port is 4040, which would conflict with tasks that are already running, so it is changed to 4075 here.
Setting the memory and CPU resources used by the job
--executor-memory 6g --total-executor-cores 45
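Put together on the spark-sql command line, the flags look like this (a sketch; the query is a placeholder). On a standalone master, --total-executor-cores caps the total cores granted across all executors, while --executor-memory sets the memory of each executor:

    spark-sql --master spark://10.130.2.20:7077 \
        --executor-memory 6g \
        --total-executor-cores 45 \
        --conf spark.ui.port=4075 \
        -e "select 1"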
The statement was originally run with hive -e; after switching it to spark-sql it is much faster: the run time dropped from about 15 minutes to about 45 seconds.