Linux Shell 網頁抓取

來源:互聯網
上載者:User
 args.txt
#!/bin/bash
#
# cmd.sh - download a web page and save the links that match a pattern.
#
# Usage: cmd.sh input
#
# "input" is a "key = value" config file. Recognised keys:
#   url        page to download (fetched with curl, converted GBK -> UTF-8)
#   savepath   file that receives the de-duplicated matches, one per line
#   pagereg    Perl-compatible regex (grep -P); every match is one result
#   beginwith  optional; text up to and including its first occurrence is cut
#   endwith    optional; text from its first occurrence onwards is cut
#   prefix     optional; literal text prepended to every result
#   proxy      optional; proxy handed to "curl -x"
#
# Besides the scraper, the script prints a few small demonstrations of shell
# features (argument loops, brace ranges, string trimming) to stdout.

# Settings filled in by load_config; empty means "not configured".
url=''
beginwith=''
endwith=''
pagereg=''
savepath=''
prefix=''
proxy=''
tmpfile=''

#######################################
# Print $1 with leading and trailing whitespace removed.
# Arguments: $1 - text to trim
# Outputs:   trimmed text followed by a newline
#######################################
trim() {
  local text=$1
  text=${text#"${text%%[![:space:]]*}"}
  text=${text%"${text##*[![:space:]]}"}
  printf '%s\n' "$text"
}

#######################################
# Demo: print the script name, every argument, and a few counting loops.
# Arguments: the script's own positional parameters
# Outputs:   one value per line on stdout
#######################################
show_arguments() {
  local arg i

  printf '%s\n' "$0"
  for arg in "$@"; do
    printf '%s\n' "$arg"
  done

  # Counts from -3 up to the number of arguments.
  for ((i = -3; i <= $#; i++)); do
    printf '%s\n' "$i"
  done

  for i in {0..5}; do
    printf '%s\n' "$i"
  done

  printf '%s\n' "$*"

  for ((i = 4; i < 7; i++)); do
    printf '%s\n' "$i"
  done
}

#######################################
# Read the "key = value" config file into the global settings.
# Globals:   url beginwith endwith pagereg savepath prefix proxy (written)
# Arguments: $1 - path of the config file
#######################################
load_config() {
  local line key value

  # "read" is used WITHOUT -r on purpose: the config format doubles its
  # backslashes (e.g. "\\d"), and read's backslash processing is what turns
  # them back into single ones before the regex reaches grep.
  # The "|| [[ -n ... ]]" part keeps a final line that has no trailing newline.
  # shellcheck disable=SC2162
  while read line || [[ -n "$line" ]]; do
    key=$(trim "${line%%=*}")
    value=$(trim "${line#*=}")
    case "$key" in
      url) url=$value ;;
      beginwith) beginwith=$value ;;
      endwith) endwith=$value ;;
      pagereg) pagereg=$value ;;
      savepath) savepath=$value ;;
      prefix) prefix=$value ;;
      proxy) proxy=$value ;;
    esac
  done < "$1"
}

#######################################
# Download $url (through $proxy when set) and convert it from GBK to UTF-8.
# Globals:   url, proxy (read)
# Outputs:   the converted page on stdout
#######################################
fetch_page() {
  local -a curl_args=(-s)

  if [[ -n "$proxy" ]]; then
    curl_args+=(-x "$proxy")
  fi
  curl "${curl_args[@]}" "$url" | iconv -f gbk -t utf-8
}

#######################################
# Demo: prefix removal and whitespace trimming on sample strings.
# Outputs:   the intermediate values on stdout
#######################################
show_string_demo() {
  local str="0000012345456789000000"

  printf '%s\n' "$str"
  # Alternatives for taking substrings:
  #   expr substr "$str" 1 2
  #   str=${str:2:3}
  str=${str#*0}
  printf '%s\n' "$str"

  # Trim the string.
  str="  s =  "
  str=$(trim "$str")
  printf '[%s]\n' "$str"
  trim "$str"
}

main() {
  local content

  if [[ -z "${1:-}" || ! -e "$1" ]]; then
    echo "Usage: cmd.sh input " >&2
    exit 1
  fi

  show_arguments "$@"
  echo "all:$$"

  # Results are staged in a private temp file that is removed on every exit
  # path, then copied to savepath once extraction has finished.
  tmpfile=$(mktemp) || {
    printf 'cmd.sh: cannot create temporary file\n' >&2
    exit 1
  }
  trap 'rm -f -- "$tmpfile"' EXIT

  load_config "$1"

  printf '%s\n' \
    "url:$url" \
    "beginwith:$beginwith" \
    "pagereg:$pagereg" \
    "endwith:$endwith" \
    "prefix:$prefix" \
    "proxy:$proxy" \
    "savepath:$savepath" \
    "tmpfile:$tmpfile"

  if [[ -z "$url" || -z "$savepath" ]]; then
    printf 'cmd.sh: "url" and "savepath" must be set in %s\n' "$1" >&2
    exit 1
  fi

  content=$(fetch_page)
  printf 'download:%s byte(s)\n' "${#content}"

  # The markers are quoted so they match literally rather than as glob
  # patterns, and skipped when empty: an empty endwith would otherwise make
  # "%%*" swallow the whole page.
  if [[ -n "$beginwith" ]]; then
    content=${content#*"$beginwith"}
  fi
  if [[ -n "$endwith" ]]; then
    content=${content%%"$endwith"*}
  fi
  printf 'after filer:%s byte(s)\n' "${#content}"

  # Flatten the page to one line (whitespace runs become a single space) so a
  # pattern may span what were separate lines, extract every match, drop
  # duplicates keeping first-seen order, and prepend the prefix. The prefix is
  # passed through the environment so characters such as "/" or "&" are taken
  # literally.
  printf '%s' "$content" \
    | tr -s ' \t\n' ' ' \
    | grep -Po -- "$pagereg" \
    | link_prefix="$prefix" awk '!seen[$0]++ { print ENVIRON["link_prefix"] $0 }' \
      > "$tmpfile"
  cp -- "$tmpfile" "$savepath"

  show_string_demo
}

main "$@"

 

url = focus.news.163.com
beginwith = <ul class="focuslist-1" id="focusTab-1">
pagereg = (?<=href=\\")http://focus\\.news\\.163\\.com/[\\d]+.+?(?=\\")
endwith =  <div class="con-4" area clearfix">
savepath = 163.txt

 

相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.