Lyrics Crawlers and Folk Lyrics Crawlers
Because a conversational chat system requires a large corpus, I decided to use song lyrics as training data. I wrote a crawler and crawled the lyrics of a large number of songs (the exact count is missing from the original text).
I used these lyrics as question-and-answer pairs and then used an LSTM-QA model for Q&A matching. After many experiments it achieved good results: it can basically hold a normal chat with you.
# Crawler for www.lrcgc.com: collects every singer, walks each singer's
# paginated song lists, then downloads the lyric file of every song found.
# Written for Python 2 with BeautifulSoup 3, like the original script.
import contextlib
import re
import sys
import urllib
import urlparse

from BeautifulSoup import BeautifulSoup

# Site root; every href scraped from the site is appended to it.
url = u'http://www.lrcgc.com/'

# Link patterns, compiled once. The dot before "html" is escaped so it only
# matches a literal '.', not any character.
_SONGLIST_HREF = re.compile(r'songlist.*\.html')
_LYRIC_HREF = re.compile(r'lyric-.*\.html')


def _fetch_soup(address):
    """Download *address* and return the page parsed by BeautifulSoup.

    The HTTP response is always closed, even if reading fails.
    """
    with contextlib.closing(urllib.urlopen(address)) as response:
        data = response.read()
    return BeautifulSoup(data)


def find_singers():
    """Return ``[name, songlist_href]`` pairs scraped from the artist index.

    Every anchor on the artist index page whose href matches the song-list
    pattern contributes one pair: the anchor text and its href.
    """
    soup = _fetch_soup('http://www.lrcgc.com/artist-00.html')
    return [[link.text, link['href']]
            for link in soup.findAll('a', href=_SONGLIST_HREF)]


def find_songs(singer):
    """Return the distinct lyric-page hrefs reachable from a singer's pages.

    *singer* is a ``[name, songlist_href]`` pair as produced by
    ``find_singers``. Starting from the singer's first song-list page, every
    linked song-list page is visited exactly once and all lyric links found
    on those pages are collected. The order of the returned list is
    unspecified.
    """
    first_page = singer[1]
    pending = [first_page]
    # Every page that was ever queued (visited or still pending); a set makes
    # the "already known?" check O(1).
    known = set()
    known.add(first_page)
    lyric_hrefs = set()

    while pending:
        page = pending.pop()
        soup = _fetch_soup(url + page)
        for link in soup.findAll('a', href=_SONGLIST_HREF):
            href = link['href']
            if href not in known:
                known.add(href)
                pending.append(href)
        for link in soup.findAll('a', href=_LYRIC_HREF):
            lyric_hrefs.add(link['href'])

    return list(lyric_hrefs)


# Map each singer name to the list of that singer's lyric-page hrefs.
dic = {}
# The singer list has to be fetched before it can be iterated.
singers_list = find_singers()
for singer in singers_list:
    try:
        ss = find_songs(singer)
        print(singer[0].encode('utf-8') + '\t' + str(len(ss)))
        dic[singer[0]] = ss
    except Exception as exc:
        # Best effort: one failing singer must not stop the crawl, but the
        # failure is reported, and Ctrl-C is no longer swallowed.
        sys.stderr.write('skipping %r: %r\n' % (singer[0], exc))
        continue


def parse_song_href(singer, song_url):
    """Download the lyric file linked from one song page.

    *singer* is accepted for interface compatibility and is not used.
    *song_url* is the song page's href relative to ``url``.

    The file is saved in the current directory under the second
    '/'-separated component of the download href. Returns the download URL
    on success, or ``False`` when the page cannot be fetched, has no
    download link, or the download/write fails.
    """
    try:
        soup = _fetch_soup(url + song_url)
    except Exception:
        return False

    anchors = soup.findAll('a', id='J_downlrc')
    if not anchors:
        # A page without a download link is a failed song, not a crash.
        return False

    try:
        name = anchors[0]['href']
        download_url = url + name
        with contextlib.closing(
                urllib.urlopen(download_url.encode('utf-8'))) as response:
            content = response.read()
        # Binary mode: the payload is written exactly as downloaded.
        with open('./' + name.encode('utf-8').split('/')[1], 'wb') as f:
            f.write(content)
        return download_url
    except Exception:
        return False


for singer_name in dic:
    for song_url in dic[singer_name]:
        print(parse_song_href(singer_name, song_url))