#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Function:
[Solved] BeautifulSoup has already produced a Unicode soup, but the printed
output is garbled.
http://www.crifan.com/beautifulsoup_already_got_unicode_soup_but_print_messy_code

Author:     Crifan Li
Version:    2013-05-30
Contact:    http://www.crifan.com/contact_me/
"""

from __future__ import print_function

import re
import urllib2
from urllib import urlopen

from BeautifulSoup import BeautifulSoup


def scrapeW3school():
    """Fetch a GB2312-encoded page and show why printing soup looks garbled.

    Downloads the w3school HTML tables tutorial, parses it with
    BeautifulSoup 3, and prints every <td> element in several different ways
    so the effect of the output encoding can be compared on a GBK console
    (e.g. the Windows cmd of a Simplified Chinese system).

    Takes no arguments and returns None; all results are printed to stdout.
    Requires network access.
    """
    response = urllib2.urlopen("http://www.w3school.com.cn/html/html_tables.asp")
    try:
        # Calling BeautifulSoup(response) without fromEncoding has the same
        # effect.  Actual test result: even without fromEncoding,
        # BeautifulSoup automatically and correctly detects that the original
        # character encoding is GB2312 and parses the page into a Unicode soup.
        soup = BeautifulSoup(response, fromEncoding="GB2312")
    finally:
        # The constructor has consumed the response; release the connection.
        response.close()
    # print("soup=", soup)

    allTdSoup = soup.findAll("td")
    # type(allTdSoup)= <class 'BeautifulSoup.ResultSet'>, but it is actually
    # a list.
    print("type(allTdSoup)=", type(allTdSoup))
    # len(allTdSoup)= 32, i.e. the list holds 32 items here.
    print("len(allTdSoup)=", len(allTdSoup))
    print("allTdSoup=", allTdSoup)
    # allTdSoup= [<td>row 1, cell 1</td>, <td>row 1, cell 2</td>, <td>row 2, ......,
    #   <td><a href="/tags/tag_tfoot.asp">&lt;tfoot&gt;</a></td>,
    #   <td>(garbled characters)</td>,
    #   <td><a href="/tags/tag_col.asp">&lt;col&gt;</a></td>,
    #   <td>(garbled characters)</td>,
    #   <td><a href="/tags/tag_colgroup.asp">&lt;colgroup&gt;</a></td>,
    #   <td>(garbled characters)</td>]
    #
    # The output above looks garbled.  allTdSoup is a list, and although
    # every soup in it is stored internally as normal Unicode, printing it
    # still produces garbled characters, because:
    #
    # 1. The official documentation
    #    http://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html
    #    says:
    #    "When you call __str__, prettify, or renderContents, you can specify
    #    an output encoding.  The default encoding (the one used by str) is
    #    UTF-8."
    #
    # So:
    # Printing allTdSoup means printing a list of soups.  Each soup in the
    # list is an object, and turning it into a string calls its __str__ by
    # default.  This is therefore equivalent to:
    #   for each soup in allTdSoup:
    #       call the soup's __str__ to get the string with its content
    #   then combine the strings into the output seen above, such as
    #   ["xxx", "xxx", ...], where "xxx" is the result of each soup.__str__().
    #
    # As the official documentation states, __str__ uses UTF-8 by default,
    # so each string obtained here is UTF-8 encoded.  It is printed to cmd,
    # and cmd uses GBK, so UTF-8 encoded characters displayed in a GBK cmd
    # show up garbled.
    #
    # Where:
    # (1) If you are not familiar with cmd being GBK, see
    #     "Set the character encoding: Simplified Chinese GBK / English":
    #     http://www.crifan.com/files/doc/docbook/soft_dev_basic/release/html/soft_dev_basic.html#cmd_encoding
    # (2) If you do not know GBK or UTF-8 themselves, see
    #     "Detailed description of character encoding":
    #     http://www.crifan.com/files/doc/docbook/char_encoding/release/html/char_encoding.html
    # (3) The soup itself is Unicode, so, as the official documentation
    #     says, you can pass GBK as the encoding to __str__ and the Chinese
    #     characters are then displayed correctly instead of garbled.
    for eachTdSoup in allTdSoup:
        # type(eachTdSoup)= <type 'instance'>, i.e. an instance of a
        # BeautifulSoup class.
        print("type(eachTdSoup)=", type(eachTdSoup))
        # The string attribute of the soup, i.e. the text content inside the
        # tag.  It is Unicode, so Chinese characters are printed normally,
        # not garbled.
        print("eachTdSoup.string=", eachTdSoup.string)
        # Note that this is not the plain unicode type but:
        # type(eachTdSoup.string)= <class 'BeautifulSoup.NavigableString'>
        print("type(eachTdSoup.string)=", type(eachTdSoup.string))
        # Printing the soup itself is equivalent to eachTdSoup.__str__(),
        # i.e. eachTdSoup.__str__("UTF-8"), so Chinese text is garbled.
        print("eachTdSoup=", eachTdSoup)
        # Printing the content directly also uses UTF-8 by default, so
        # Chinese text is garbled here as well.
        print("eachTdSoup.renderContents()=", eachTdSoup.renderContents())
        # GBK is specified explicitly, so Chinese characters are displayed
        # normally, not garbled.
        print("eachTdSoup.__str__('GBK')=", eachTdSoup.__str__('GBK'))

    # Excerpt of the output:
    #   type(eachTdSoup)= <type 'instance'>
    #   eachTdSoup.string= row 1, cell 1
    #   type(eachTdSoup.string)= <class 'BeautifulSoup.NavigableString'>
    #   eachTdSoup= <td>row 1, cell 1</td>
    #   eachTdSoup.renderContents()= row 1, cell 1
    #   eachTdSoup.__str__('GBK')= <td>row 1, cell 1</td>
    #   ......
    #   type(eachTdSoup)= <type 'instance'>
    #   eachTdSoup.string= (readable Chinese: "defines the group of table columns.")
    #   type(eachTdSoup.string)= <class 'BeautifulSoup.NavigableString'>
    #   eachTdSoup= <td>(garbled characters)</td>
    #   eachTdSoup.renderContents()= (garbled characters)
    #   eachTdSoup.__str__('GBK')= <td>(readable Chinese: "defines the group of table columns.")</td>
    #
    # (4) In addition, BeautifulSoup can guess the encoding from the charset
    #     declared in the HTML.  If you are not familiar with that, see
    #     "[Summary] Explanation of the character encoding (charset) formats
    #     in HTML page source (GB2312, GBK, UTF-8, ISO8859-1, etc.)":
    #     http://www.crifan.com/summary_explain_what_is_html_charset_and_common_value_of_gb2312_gbk_utf_8_iso8859_1


if __name__ == "__main__":
    scrapeW3school()