"Python Network Data Collection" reading notes, Chapter 3: Starting to Crawl

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import datetime

pages = set()
random.seed(datetime.datetime.now())

# Retrieves a list of all internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    internalLinks = []
    # Finds all links that begin with a "/" or contain the current URL
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])
    return internalLinks

# Retrieves a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # Finds all links that start with "http" or "www" and do not
    # contain the current URL
    for link in bsObj.findAll("a",
            href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

# Strips the protocol from an address and splits it on "/";
# the first element is the bare domain name
def splitAddress(address):
    addressParts = address.replace("http://", "").split("/")
    return addressParts

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        # No external links on this page: follow a random internal link
        # and look for an external link there instead
        internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])
        return getRandomExternalLink(internalLinks[random.randint(0,
                                     len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("The random external link is: " + externalLink)
    followExternalOnly(externalLink)

followExternalOnly("http://oreilly.com")