Installation
sudo pip install pyquery
Example
# -*- coding: utf-8 -*-
"""Print the text of every list item matched by ``.r li`` on www.lzu.edu.cn.

The example was written for Python 2; the import fallback and the
``isinstance`` check below let the same file run on Python 3 as well.
"""
from pyquery import PyQuery

try:  # Python 2 module name, as used by the original example
    from urllib2 import urlopen
except ImportError:  # Python 3 moved urlopen to urllib.request
    from urllib.request import urlopen

page = urlopen("http://www.lzu.edu.cn")
try:
    # Decode once at the I/O boundary so PyQuery receives text, not bytes.
    text = page.read().decode("utf-8")
finally:
    # Release the connection even if reading or decoding fails.
    page.close()

doc = PyQuery(text)
for element in doc('.r li'):
    # Iterating a PyQuery selection yields raw elements; re-wrap each one to
    # get the PyQuery API back.  A narrower lookup such as
    # event.find('.h').text() or event.find('title').text() can be used
    # instead of event.text() to pull out a single field.
    event = PyQuery(element)
    name = event.text()
    if not isinstance(name, str):
        # Python 2 only: the text is a unicode object while the message
        # below is a UTF-8 byte string, so encode before formatting.
        name = name.encode('utf-8')
    print('名字 : %s' % name)
    print('----------------------')
Note that the text of each event is a Unicode string. In memory Python 2 stores it in a fixed-width Unicode representation (2 bytes per character on narrow builds), so it must be encoded to UTF-8 bytes before it is printed.
Of course, other modules can be used as well, for example HTMLParser:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Print the name and location of every event listed on python.org.

The page is fetched and fed through an HTMLParser subclass that watches
for the tags carrying the event title and the event location.
"""
try:  # Python 2 module names, as used by the original example
    from HTMLParser import HTMLParser
    from htmlentitydefs import name2codepoint  # imported by the example; unused below
    from urllib2 import urlopen
except ImportError:  # Python 3 renamed these modules
    from html.parser import HTMLParser
    from html.entities import name2codepoint  # imported by the example; unused below
    from urllib.request import urlopen

EVENTS_URL = 'https://www.python.org/events/python-events/'


class MyHTMLParser(HTMLParser):
    """Print event titles and locations while the page is being parsed.

    ``_flag`` remembers which interesting tag was opened last, so that
    ``handle_data`` knows how to label the next piece of text it receives.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Empty string means "the next text chunk is not interesting".
        self._flag = ''

    def handle_starttag(self, tag, attrs):
        """Record which kind of field the tag that just opened belongs to.

        ``attrs`` is a list of ``(name, value)`` pairs, so the class test
        has to look for the whole pair.
        """
        if tag == 'h3' and ('class', 'event-title') in attrs:
            self._flag = 'event-title'
        if tag == 'time':
            # Tracked but never printed: handle_data has no branch for it,
            # so the flag only replaces whatever was pending before.
            self._flag = 'time'
        if tag == 'span' and ('class', 'event-location') in attrs:
            self._flag = 'event-location'

    def handle_data(self, data):
        """Print ``data`` when it follows a title or location tag."""
        if self._flag == 'event-title':
            print('conference name:%s' % data)
            self._flag = ''
        if self._flag == 'event-location':
            print('meeting place:%s' % data)
            # The location is the last field printed for an event, so it is
            # followed by a separator line.
            print('-------------------')
            self._flag = ''


def main():
    """Download the python.org events page and print its events."""
    response = urlopen(EVENTS_URL)
    try:
        page = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    if not isinstance(page, str):
        # Python 3 returns bytes, which HTMLParser.feed() does not accept;
        # on Python 2 the byte string is fed unchanged, as in the original.
        page = page.decode('utf-8')
    parser = MyHTMLParser()
    parser.feed(page)


if __name__ == '__main__':
    main()
References
[1].http://www.douban.com/note/208670234/
[2].http://blog.csdn.net/mindmb/article/details/7898528
[3].http://pythonhosted.org/pyquery/api.html
Use PyQuery to extract the content of specified HTML tags