Example of a web crawler from Core Python Programming
#!/usr/bin/env python

import cStringIO                # in-memory string buffer for the formatter output
import formatter                # formatter classes that drive the HTML parser
from htmllib import HTMLParser  # we use various classes in these modules for parsing HTML
import httplib                  # we only need an exception from this module
import os                       # this provides various file system functions
import sys                      # we are just using argv for command-line arguments
import urllib                   # we only need the urlretrieve() function for downloading Web pages
import urlparse                 # we use the urlparse() and urljoin() functions for URL manipulation

class Retriever(object):
    __slots__ = ('url', 'file')

    def __init__(self, url):
        self.url, self.file = self.get_file(url)

    def get_file(self, url, default='index.html'):
        'Create usable local filename from URL'
        parsed = urlparse.urlparse(url)                    # ParseResult(scheme='http', netloc='www.baidu.com', path='', params='', query='', fragment='')
        host = parsed.netloc.split('@')[-1].split(':')[0]  # 'www.baidu.com'
        filepath = '%s%s' % (host, parsed.path)            # 'www.baidu.com'
        if not os.path.splitext(parsed.path)[1]:           # no file extension, so append the default name
            filepath = os.path.join(filepath, default)     # 'www.baidu.com\index.html'
        linkdir = os.path.dirname(filepath)                # 'www.baidu.com'
        if not os.path.isdir(linkdir):
            if os.path.exists(linkdir):
                os.unlink(linkdir)
            os.makedirs(linkdir)                           # make a directory named after the link on disk
        return url, filepath

    def download(self):
        'Download URL to specifically named file'
        try:
            retval = urllib.urlretrieve(self.url, self.file)
        except (IOError, httplib.InvalidURL) as e:
            retval = (('*** ERROR: bad URL "%s": %s' % (self.url, e)),)
        return retval

    def parse_links(self):
        'Parse out the links found in downloaded HTML file'
        f = open(self.file, 'r')
        data = f.read()
        f.close()
        parser = HTMLParser(formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
        parser.feed(data)
        parser.close()
        return parser.anchorlist

class Crawler(object):
    count = 0                   # the number of objects downloaded from the Internet

    def __init__(self, url):
        self.q = [url]          # a queue of links to download
        self.seen = set()       # a set of all the links we have seen (downloaded) already
        parsed = urlparse.urlparse(url)
        host = parsed.netloc.split('@')[-1].split(':')[0]
        self.dom = '.'.join(host.split('.')[-2:])   # 'baidu.com'

    def get_page(self, url, media=False):
        'Download page & parse links, add to queue if nec'
        r = Retriever(url)
        fname = r.download()[0]                 # 'www.baidu.com\index.html'
        if fname[0] == '*':                     # error strings start with '*'
            print fname, '... skipping parse'
            return
        Crawler.count += 1                      # 1
        print '\n(', Crawler.count, ')'         # ( 1 )
        print 'URL:', url                       # URL: http://www.baidu.com
        print 'FILE:', fname                    # FILE: www.baidu.com\index.html
        self.seen.add(url)                      # set(['http://www.baidu.com'])
        ftype = os.path.splitext(fname)[1]      # '.html'
        if ftype not in ('.htm', '.html'):      # only parse HTML files
            return

        for link in r.parse_links():
            if link.startswith('mailto:'):
                print '... discarded, mailto link'
                continue
            if not media:
                ftype = os.path.splitext(link)[1]
                if ftype in ('.mp3', '.mp4', '.m4v', '.wav'):
                    print '... discarded, media file'
                    continue
            if not link.startswith('http://'):
                link = urlparse.urljoin(url, link)   # make relative links absolute
            print '*', link,
            if link not in self.seen:
                if self.dom not in link:
                    print '... discarded, not in domain'
                else:
                    if link not in self.q:
                        self.q.append(link)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self, media=False):
        'Process next page in queue (if any)'
        while self.q:
            url = self.q.pop()
            self.get_page(url, media)

def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    if not url.startswith('http://') and not url.startswith('ftp://'):
        url = 'http://%s/' % url
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
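The filename and domain logic above leans entirely on the urlparse module. Here is a minimal sketch (Python 2; the URLs are just examples) of what urlparse() and urljoin() return for the kind of input get_file() and get_page() handle:

import urlparse

parsed = urlparse.urlparse('http://www.baidu.com/a/b.html')
print parsed.netloc    # www.baidu.com
print parsed.path      # /a/b.html

# urljoin() resolves a relative link against the page it was found on,
# which is how get_page() makes relative links absolute before queueing.
print urlparse.urljoin('http://www.baidu.com/a/b.html', 'c.html')
# http://www.baidu.com/a/c.html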
What should you focus on when learning Python, and how can the second edition of Core Python Programming best be used?
Python is a fourth-generation programming language with a low entry threshold and a quick start.
At the beginning, to build interest, you can write small script examples, such as batch file processing and regular-expression matching (see the sketch below); other areas, such as network applications, are also worth touching.
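For instance, a minimal batch-processing sketch (Python 2; the 'logs' directory and the search pattern are made-up examples) that scans every .txt file in a folder with a regular expression:

import os
import re

pattern = re.compile(r'error', re.IGNORECASE)   # example pattern
for name in os.listdir('logs'):                 # example directory
    if not name.endswith('.txt'):
        continue
    path = os.path.join('logs', name)
    with open(path) as f:
        hits = [line for line in f if pattern.search(line)]
    print '%s: %d matching lines' % (name, len(hits))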
To go deeper, you need to understand object-oriented programming.
Core Python Programming collects results from online Python forums, so it is quite practical; you can learn from its examples.
After that, look for popular Python frameworks and Python open-source projects and read their source code.
In general, though, Python has an efficiency bottleneck; in large systems it usually plays the role of script glue and batch processing.
In the example at the beginning of chapter 2 of Core Python Programming, what does the append mode of the open function do? Why is the printed content not displayed?
It depends on what you want to do. If you want the string
'Fatal error: invalid input!'
written to a file, both modes will create it: 'w' creates the file and truncates it if it already exists, while 'a' also creates the file when it does not exist, and otherwise opens the existing file and appends new content to the end without erasing what is already there.
If the program runs smoothly, a mylog.txt file appears in the folder you specified, containing the content you printed into it; the sketch below shows why nothing appears on screen.
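A minimal sketch of that chapter 2 pattern (Python 2; the filename follows the question):

logfile = open('mylog.txt', 'a')   # 'a' appends; the file is created if it is missing
print >> logfile, 'Fatal error: invalid input!'
logfile.close()

Because print >> logfile redirects the output stream into the file, the text never reaches the console, which is why the printed content is not displayed.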
The 'b' mode is used together with the pickle module. Check the pickle section of the Python manual; after reading it, you will understand what 'b' is for.
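A minimal sketch of that combination (Python 2; the data and the filename data.pkl are made-up examples). Pickle protocol 2 is a binary format, so the file must be opened in binary mode:

import pickle

data = {'url': 'http://www.baidu.com', 'count': 1}

f = open('data.pkl', 'wb')      # 'wb' = write in binary mode
pickle.dump(data, f, 2)         # protocol 2 is a binary pickle format
f.close()

f = open('data.pkl', 'rb')      # 'rb' = read in binary mode
print pickle.load(f)            # {'url': 'http://www.baidu.com', 'count': 1}
f.close()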