This method extracts the album ID from a Youku album URL, then loops over the album's pages by index, downloading each page's HTML and extracting the title and the video ID (VID) from it. There is nothing technically sophisticated here. =.=
The pages are downloaded with the .NET HttpWebRequest and HttpWebResponse classes, and the HTML is parsed with regular expressions.
This code is not very efficient: processing a single web page takes roughly 0.5 to 2 seconds, so it is not suitable for large-scale collection. Porting it to JavaScript might be faster.
This is as far as my research has gone for now, so I am sharing the code as it stands.
The code is VB.NET. Create a form named frmMain, add one TextBox, one ListBox, and two Buttons, then paste in the following code:
The code is as follows:
Imports System. Net
Imports System. IO
Imports System. Text
Imports System. Text. RegularExpressions
''' <summary>
''' Main form of the Youku album collector.
''' Button1 reads an album URL from TextBox1, downloads the album's video pages
''' one index at a time and records each page's video IDs and title.
''' Button2 renders the collected entries into ListBox1.
''' </summary>
Public Class frmMain

    ' Extracts the numeric album (list) ID from the album URL typed by the user,
    ' e.g. http://www.youku.com/playlist_show/id_2350764.html
    Private Const ListIdPattern As String = "playlist_show/id_(\d+).*\.html"

    ' JavaScript variables embedded in every video page.
    Private Const VideoIdPattern As String = "var\s+videoId\s*=\s*""(\d+)""\s*;"
    Private Const VideoId2Pattern As String = "var\s+videoId2\s*=\s*""((\w|=)+)""\s*;"

    ' NOTE(review): the literal words inside the two <title> patterns below look
    ' machine-translated; the live pages presumably use different text. Confirm
    ' both patterns against the real page markup before relying on them.
    Private Const AlbumTitlePattern As String = "<title>(.+)-album-youku video</title>"
    Private Const VideoTitlePattern As String = "<title>(.+)-(.+)-video-youku video-watch online-</title>"

    ''' <summary>One collected video entry.</summary>
    Structure VList
        Dim id As Integer       ' zero-based index of the video inside the album
        Dim title As String     ' title parsed from the page's <title> tag
        Dim vid1 As String      ' value of the page's "videoId" variable
        Dim vid2 As String      ' value of the page's "videoId2" variable

        ''' <summary>Formats the entry as index, title and videoId.</summary>
        ''' <remarks>
        ''' Declared Overrides so that callers going through Object.ToString
        ''' also get this text instead of the default type name.
        ''' </remarks>
        Public Overrides Function ToString() As String
            Return String.Format("{0 }:< {1}> [{2}]", id, title, vid1)
        End Function
    End Structure

    ' Entries gathered by the most recent collection run.
    Dim myList As New List(Of VList)

    ''' <summary>
    ''' Downloads a page and returns its body as text.
    ''' </summary>
    ''' <param name="url">Absolute URL of the page to download.</param>
    ''' <returns>The decoded response body.</returns>
    ''' <exception cref="WebException">The request failed.</exception>
    Private Shared Function FetchPage(ByVal url As String) As String
        Dim request As HttpWebRequest = CType(WebRequest.Create(url), HttpWebRequest)
        ' Using guarantees the response and reader are released on every path;
        ' responses left open keep their connection occupied.
        Using response As HttpWebResponse = CType(request.GetResponse(), HttpWebResponse)
            Dim pageEncoding As Encoding
            Try
                pageEncoding = Encoding.GetEncoding(response.CharacterSet)
            Catch ex As ArgumentException
                ' Missing or unrecognised charset in the response headers.
                pageEncoding = Encoding.UTF8
            End Try
            Using reader As New StreamReader(response.GetResponseStream(), pageEncoding)
                Return reader.ReadToEnd()
            End Using
        End Using
    End Function

    ''' <summary>
    ''' Collects every video of the album whose URL is in TextBox1, then shows
    ''' the result list.
    ''' </summary>
    Private Sub Button1_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button1.Click
        ' Album URL, such as http://www.youku.com/playlist_show/id_2350764.html
        Dim listUrl As String = TextBox1.Text

        Dim listMatch As Match = Regex.Match(listUrl, ListIdPattern)
        If Not listMatch.Success Then
            MsgBox("album list extraction failed! ")
            Exit Sub
        End If

        ' Step 1 fills in the list ID. The suffix is appended afterwards so its
        ' placeholders survive for step 2: {0} = video index, {1} = sort order.
        Dim tarUrl As String = String.Format("http://v.youku.com/v_playlist/f{0}", listMatch.Groups(1).Value) & "o{1}p{0}.html"

        ' Start every run from an empty list so repeated runs do not pile up.
        myList.Clear()

        Dim nowid As Integer = 0        ' index of the video currently being fetched
        Dim preVid As String = ""       ' videoId of the previously collected page

        Try
            ' Album page: only used to report the album name.
            Dim albumMatch As Match = Regex.Match(FetchPage(listUrl), AlbumTitlePattern)
            If Not albumMatch.Success Then
                MsgBox("album name extraction failed! ")
            Else
                MsgBox("album name:" & albumMatch.Groups(1).Value & "")
            End If

            Do
                ' Sort order is fixed to "0".
                Dim html As String = FetchPage(String.Format(tarUrl, nowid, "0"))

                Dim nlist As New VList
                nlist.id = nowid

                ' videoId: a page without it ends the collection.
                Dim idMatch As Match = Regex.Match(html, VideoIdPattern)
                If Not idMatch.Success Then Exit Do
                ' The same videoId as on the previous page means the index ran
                ' past the end of the album.
                If idMatch.Groups(1).Value = preVid Then Exit Do
                nlist.vid1 = idMatch.Groups(1).Value

                ' videoId2 (letters, digits, underscores and '=').
                Dim id2Match As Match = Regex.Match(html, VideoId2Pattern)
                If Not id2Match.Success Then Exit Do
                nlist.vid2 = id2Match.Groups(1).Value

                ' Title: the second capture group of the <title> tag.
                Dim titleMatch As Match = Regex.Match(html, VideoTitlePattern)
                If titleMatch.Success Then
                    nlist.title = titleMatch.Groups(2).Value
                Else
                    nlist.title = "{name search error }"
                End If

                myList.Add(nlist)
                preVid = nlist.vid1
                Me.Text = nowid & ": Processing complete! "
                nowid += 1
            Loop
        Catch ex As WebException
            ' Report the failure but still show whatever was collected so far.
            MsgBox("page request failed: " & ex.Message)
        End Try

        MsgBox(nowid & "videos are all collected and processed! ")
        Button2_Click(sender, e)
    End Sub

    ''' <summary>
    ''' Re-renders the collected entries into ListBox1.
    ''' </summary>
    ''' <remarks>
    ''' The list itself is left intact, so clicking Button2 again redraws the
    ''' same results instead of emptying the list box.
    ''' </remarks>
    Private Sub Button2_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button2.Click
        ListBox1.Items.Clear()
        For Each ls As VList In myList
            ListBox1.Items.Add(ls.ToString())
        Next
    End Sub
End Class
An original article by Night smell.
Blog: http://clso.cnblogs.com
Home: http://cleclso.cn
QQ: 315514678 E-mail: clso # qq.com
Technical discussion is welcome!