1. The most basic page fetch
# Download a page's raw bytes with a single call.
# (Original was Python 2 `urllib2` with mojibake quote characters, which is
# not valid syntax in any Python version; modernized to `urllib.request`.)
import urllib.request

content = urllib.request.urlopen('http://XXXX').read()
2. Using a proxy server
# Fetch a page through an HTTP proxy.
# (Original was Python 2 `urllib2` with mojibake quotes; modernized.)
import urllib.request

# Route all http:// requests through the given proxy host:port.
proxy_support = urllib.request.ProxyHandler({'http': 'http://XX.XX.XX.XX:XXXX'})
opener = urllib.request.build_opener(proxy_support, urllib.request.HTTPHandler)
# Make the proxy-aware opener the process-wide default for urlopen().
urllib.request.install_opener(opener)
content = urllib.request.urlopen('http://XXXX').read()
3. When you need to log in
# Fetching pages that require a login.
# (Original was Python 2 `urllib2`/`cookielib` with mojibake quotes and the
# statements scattered one token per line; modernized to Python 3 and
# reassembled. Comments translated from Chinese.)
import urllib.request
import urllib.parse
import http.cookiejar

# 1) Cookie handling: keep session cookies across requests.
cookie_support = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())
opener = urllib.request.build_opener(cookie_support, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)
content = urllib.request.urlopen('http://XXXX').read()

# 2) Proxy and cookies together (proxy_support comes from the previous
#    proxy example — it is not defined in this snippet).
opener = urllib.request.build_opener(proxy_support, cookie_support,
                                     urllib.request.HTTPHandler)

# 3) Form handling: URL-encode the fields and POST them.
#    urlopen() requires bytes for the POST body in Python 3, hence .encode().
postdata = urllib.parse.urlencode({
    'username': 'XXXXX',
    'password': 'XXXXX',
    'continueURI': 'http://www.verycd.com/',
    # NOTE(review): `fk` is presumably an anti-forgery token scraped from the
    # login page; it is not defined anywhere in this snippet.
    'fk': fk,
    '登录' 'login_submit': '登录',  # the submit button's value, sent verbatim
}).encode('utf-8')
req = urllib.request.Request(
    url='http://secure.verycd.com/signin/*/http://www.verycd.com/',
    data=postdata,
)
result = urllib.request.urlopen(req).read()

# 4) Masquerade as a browser by supplying a User-Agent header
#    (some sites reject requests with the default Python user agent).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}
req = urllib.request.Request(
    url='http://secure.verycd.com/signin/*/http://www.verycd.com/',
    data=postdata,
    headers=headers,
)

# 5) Defeat simple anti-hotlinking checks by faking a Referer header.
headers = {'Referer': 'http://www.cnbeta.com/articles'}
|
4. Multithreaded concurrent fetching
# Multithreaded fetching with the classic worker-pool + queue pattern.
# (Original was Python 2: `Queue` module, `print` statement, deprecated
# `setDaemon`, and a typo `do_somthing_using`; modernized to Python 3.
# Comments translated from Chinese.)
from threading import Thread
from queue import Queue
from time import sleep

# q    — the task queue shared by all workers
# NUM  — number of concurrent worker threads
# JOBS — number of jobs to enqueue
q = Queue()
NUM = 2
JOBS = 10


def do_something_using(arguments):
    """Process a single task (demo: just print it)."""
    print(arguments)


def working():
    """Worker loop: pull tasks off the queue forever and process them."""
    while True:
        arguments = q.get()
        do_something_using(arguments)
        sleep(1)  # simulate per-task work
        q.task_done()  # pair with q.join() in the driver


if __name__ == '__main__':
    # Start NUM daemon workers waiting on the queue; daemon threads
    # don't block interpreter exit once the main thread finishes.
    for i in range(NUM):
        t = Thread(target=working)
        t.daemon = True
        t.start()
    # Enqueue the JOBS tasks.
    for i in range(JOBS):
        q.put(i)
    # Block until every enqueued job has been marked task_done().
    q.join()
|
Report
Some tips for fetching websites with Python crawlers