The preceding section described how to use Jsoup to parse HTML. Jsoup is powerful and convenient; however, when you only need to parse a small amount of HTML content, pulling in a third-party package is unnecessary, and you can use a regular expression to find the matching content instead. The following is a simple example that fetches a page and extracts the text content of its title tag. The code is as follows:
Package com. home. parsehtml; import java. io. bufferedReader; import java. io. IOException; import java. io. inputStreamReader; import java.net. httpURLConnection; import java.net. URL; import java. util. regex. matcher; import java. util. regex. pattern; import android. app. activity; import android. OS. bundle; import android. util. log; import android. view. view; import android. view. view. onClickListener; import android. widget. button; public class MainActivity extends Activity implements OnClickListener {private Button btn; private static final String URL_STR =" http://vip.astro.sina.com.cn/iframe/astro/view/aries/day/ "; @ Overrideprotected void onCreate (Bundle savedInstanceState) {super. onCreate (savedInstanceState); setContentView (R. layout. main); btn = (Button) findViewById (R. id. main_btn); btn. setOnClickListener (this) ;}@ Overridepublic void onClick (View v) {if (v = btn) {new Thread (r ). start () ;}} Runnable r = new Runnable () {@ Overridepublic void run () {String content = getHtmlContent (); doParse (content );}}; /*** read html ** @ Return */protected String getHtmlContent () {StringBuffer sb = new StringBuffer (); BufferedReader br = null; try {URL url = new URL (URL_STR ); httpURLConnection conn = (HttpURLConnection) url. openConnection (); br = new BufferedReader (new InputStreamReader (conn. getInputStream (), "UTF-8"); String temp; while (temp = br. readLine ())! = Null) {sb. append (temp). append ("\ n") ;}} catch (Exception e) {e. printStackTrace () ;}finally {if (br! = Null) {try {br. close ();} catch (IOException e) {e. printStackTrace () ;}} return sb. toString ();}/*** use regular expressions to extract title Tag content ** @ param content */protected void doParse (String content) {// matchStart,Document Pattern p = Pattern. compile ("([^] *) "); Matcher m = p. matcher (content); if (m. find () {String title = m. group (1); Log. I ("title", title );}}}