1. Introduction
Let us take the "Home of Mechatronics" website (www.jdzj.com) as an example.
Preliminary analysis shows that the site's data volume is large and its data is organized by uniform rules, which makes it well suited to a full code-based crawl. The company list pages follow the uniform pattern http://www.jdzj.com/yp_vlist_1_1.html, where only the two numbers change, and each company's yellow page lives on its own subdomain of the form http://&lt;subdomain&gt;.jdzj.com.
2. Script to fetch the yellow-pages address list in bulk:
#!/bin/bash
# Fetch the full list of yellow-pages subdomain URLs from www.jdzj.com
# into url.txt, using a named-pipe token pool to cap concurrency.
# Output: url.txt (one deduplicated subdomain URL per line, appended).

# On SIGINT (Ctrl+C, signal 2): close FD 6 and exit cleanly.
trap "exec 6>&-; exec 6<&-; exit 0" 2

# Create a named pipe, bind it to FD 6 for read/write, then unlink the
# filesystem entry (the FD keeps the pipe alive).
mkfifo testfifo
exec 6<>testfifo
rm -f testfifo

# Thread pool size; seed the FIFO with one token per concurrent worker.
# NOTE(review): the original value was lost in the garbled text ("Thread=");
# 32 matches the companion script below.
thread=32
for ((n = 0; n < thread; n++)); do
  echo >&6
done

# Timestamp at the start, for the elapsed-time report.
seconds_1=$(date +%s)

# Walk the category list pages.
# NOTE(review): the upper bound was garbled in the original ("{1.. the}");
# adjust to the site's actual category count before running.
for i in {1..100}; do
  # Number of sub-pages in category i, parsed from the <b>N</b> element
  # inside the "lblpage" pager widget.
  j=$(curl -i -s "http://www.jdzj.com/yp_vlist_${i}_1.html" \
    | grep lblpage \
    | awk '{match($0, /<b>[0-9]*<\/b>/); print substr($0, RSTART + 3, RLENGTH - 7)}')
  for n in $(seq 1 "$j"); do
    read -u6  # take a token; blocks when all workers are busy
    echo "threads: $(ps -ef | grep "$0" | grep -v grep | grep -v vim | wc -l)"
    echo "completed: $(cat url.txt | wc -l)"
    echo "######################"
    {
      # Extract every http://<subdomain>.jdzj.com URL on the page,
      # strip blanks, dedupe, and append to url.txt.
      curl -i -s "http://www.jdzj.com/yp_vlist_${i}_${n}.html" \
        | awk '{match($0, /http:\/\/[a-zA-Z0-9]+\.jdzj\.com/); print substr($0, RSTART, RLENGTH)}' \
        | sort | uniq | sed -e '/^$/d' >> url.txt
      echo >&6  # return the token to the pool
    } &
  done
done
wait

# Timestamp at the end; report elapsed seconds.
seconds_2=$(date +%s)
echo "time: $(expr "$seconds_2" - "$seconds_1")"
exec 6>&-
exec 6<&-
exit 0
3. Script to visit each subdomain in turn and extract the company information:
#!/bin/bash
# Visit every subdomain URL in url.txt, scrape the company-info block
# (the 7 lines following the "lblhtc" marker), reformat the fields, and
# append the result to dd.txt. Concurrency is capped with a FIFO token pool.

# On SIGINT (Ctrl+C, signal 2): close FD 6 and exit cleanly.
trap "exec 6>&-; exec 6<&-; exit 0" 2

# FIFO bound to FD 6; unlink the path, the FD keeps it alive.
mkfifo testfifo
exec 6<>testfifo
rm -f testfifo

# Seed the pool with one token per worker.
thread=32
for ((n = 0; n < thread; n++)); do
  echo >&6
done

seconds_1=$(date +%s)
touch dd.txt

# One iteration per line of url.txt.
# (Original had "seq 1 $v 1" — the variable name was split; it must be $v1.)
v1=$(cat url.txt | wc -l)
for n in $(seq 1 "$v1"); do
  read -u6  # take a token; blocks when all workers are busy
  echo "threads: $(ps -ef | grep "$0" | grep -v grep | grep -v vim | wc -l)"
  echo "completed: $(cat dd.txt | grep phone | wc -l)"
  echo "######################"
  {
    # Fetch the n-th URL, keep the 6 lines after the "lblhtc" marker,
    # and strip markup characters, leaving bare field text.
    sed -n "${n}p" url.txt | xargs curl -i -s | egrep -A6 lblhtc \
      | sed "s/[a-z{.<>'=_:\/}]//g" > "t_$n.txt"
    l=$(cat "t_$n.txt" | wc -l)
    # Only a complete 7-line info block is usable.
    if [[ $l -eq 7 ]]; then
      i=1
      # Pick the wanted fields out of each line, keyed by line position.
      # NOTE(review): the exact awk field lists/separators were garbled in
      # the original (e.g. "{print$1 ' \"}") — verify against a live page.
      while read line; do
        j=$(expr $i % 8)
        case $j in
          1)
            echo -n "$line" | awk '{print $1" "}' >> "t_2_$n.txt"
            ;;
          2)
            # intentionally skipped
            ;;
          3)
            echo -n "$line" | awk '{print $1 $3" "}' >> "t_2_$n.txt"
            ;;
          4)
            # NOTE(review): original read "{print$pn" — almost certainly a
            # corrupted field list; confirm which fields belong here.
            echo -n "$line" | awk '{print $1" "}' >> "t_2_$n.txt"
            ;;
          5)
            echo -n "$line" | awk '{print $1 $3 $4 $6" "}' >> "t_2_$n.txt"
            ;;
          6)
            echo -n "$line" | awk '{print $1 $3 $4 $6" "}' >> "t_2_$n.txt"
            ;;
          7)
            echo -n "$line" | awk '{print $1 $3" "}' >> "t_2_$n.txt"
            # Record separator, then flush this record into dd.txt.
            echo -e "\n--------------------------" >> "t_2_$n.txt"
            cat "t_2_$n.txt" >> dd.txt
            ;;
        esac
        i=$(expr $i + 1)
      done < "t_$n.txt"
    fi
    # Clean up per-URL scratch files.
    rm -f "t_2_$n.txt"
    rm -f "t_$n.txt"
    echo >&6  # return the token to the pool
  } &
done
wait

seconds_2=$(date +%s)
echo "Time: $(expr "$seconds_2" - "$seconds_1")"
exec 6>&-
exec 6<&-
exit 0
A simple shell crawler script (with a controllable number of concurrent workers).