13 RE Module
First: What is a regular expression?
A regular expression is a way of describing a class of characters or strings by combining symbols that have special meanings. Put another way, a regular expression is a rule used to describe a class of things. In Python, regular expressions are built into the language and are made available through the `re` module. A regular expression pattern is compiled into a sequence of bytecodes, which is then executed by a matching engine written in C.
We use this kind of descriptive rule in everyday life as well.
For example, suppose we give the description: 4 legs
You might think of a four-legged animal, or of a table, a chair, and so on.
Now extend the description: 4 legs, alive
Only four-legged animals match now.
Second: Common matching patterns (metacharacters)
http://blog.csdn.net/yufenghyc/article/details/51078107
# ================================= Matching patterns =================================
# One-to-one (literal) matching
# 'hello'.replace(old, new)
# 'hello'.find('pattern')

# Regular expression matching
import re

# \w and \W
print(re.findall('\w', 'hello egon 123'))  # ['h', 'e', 'l', 'l', 'o', 'e', 'g', 'o', 'n', '1', '2', '3']
print(re.findall('\W', 'hello egon 123'))  # [' ', ' ']

# \s and \S
print(re.findall('\s', 'hello  egon  123'))  # [' ', ' ', ' ', ' ']
print(re.findall('\S', 'hello  egon  123'))  # ['h', 'e', 'l', 'l', 'o', 'e', 'g', 'o', 'n', '1', '2', '3']

# \n and \t are both whitespace, so both can be matched by \s
print(re.findall('\s', 'hello \n egon \t 123'))  # [' ', '\n', ' ', ' ', '\t', ' ']

# \n and \t
print(re.findall(r'\n', 'hello egon \n123'))  # ['\n']
print(re.findall(r'\t', 'hello egon\t123'))  # ['\t']

# \d and \D
print(re.findall('\d', 'hello egon 123'))  # ['1', '2', '3']
print(re.findall('\D', 'hello egon 123'))  # ['h', 'e', 'l', 'l', 'o', ' ', 'e', 'g', 'o', 'n', ' ']

# \A and \Z
print(re.findall('\Ahe', 'hello egon 123'))  # ['he'], \A ==> ^
print(re.findall('123\Z', 'hello egon 123'))  # ['123'], \Z ==> $

# ^ and $
print(re.findall('^h', 'hello egon 123'))  # ['h']
print(re.findall('3$', 'hello egon 123'))  # ['3']

# Repetition: | . | * | ? | .* | .*? | + | {n,m} |
# .
print(re.findall('a.b', 'a1b'))  # ['a1b']
print(re.findall('a.b', 'a1b a*b a b aaab'))  # ['a1b', 'a*b', 'a b', 'aab']
print(re.findall('a.b', 'a\nb'))  # []
print(re.findall('a.b', 'a\nb', re.S))  # ['a\nb']
print(re.findall('a.b', 'a\nb', re.DOTALL))  # ['a\nb'], same meaning as the previous line

# *
print(re.findall('ab*', 'bbbbbbb'))  # []
print(re.findall('ab*', 'a'))  # ['a']
print(re.findall('ab*', 'abbbb'))  # ['abbbb']

# ?
print(re.findall('ab?', 'a'))  # ['a']
print(re.findall('ab?', 'abbb'))  # ['ab']
# Match every number, including decimals
print(re.findall('\d+\.?\d*', "asdfasdf123as1.13dfa12adsf1asdf3"))  # ['123', '1.13', '12', '1', '3']

# .* is greedy by default
print(re.findall('a.*b', 'a1b22222222b'))  # ['a1b22222222b']

# .*? is the non-greedy form: recommended
print(re.findall('a.*?b', 'a1b22222222b'))  # ['a1b']

# +
print(re.findall('ab+', 'a'))  # []
print(re.findall('ab+', 'abbb'))  # ['abbb']

# {n,m}
print(re.findall('ab{2}', 'abbb'))  # ['abb']
print(re.findall('ab{2,4}', 'abbb'))  # ['abbb']
print(re.findall('ab{1,}', 'abbb'))  # ['abbb']; 'ab{1,}' ===> 'ab+'
print(re.findall('ab{0,}', 'abbb'))  # ['abbb']; 'ab{0,}' ===> 'ab*'

# []
print(re.findall('a[1*-]b', 'a1b a*b a-b'))  # ['a1b', 'a*b', 'a-b']; characters inside [] are ordinary characters, and an unescaped - should be placed at the beginning or end of []
print(re.findall('a[^1*-]b', 'a1b a*b a-b a=b'))  # a ^ inside [] means negation, so the result is ['a=b']
print(re.findall('a[0-9]b', 'a1b a*b a-b a=b'))  # [0-9] matches any single digit, so the result is ['a1b']
print(re.findall('a[a-z]b', 'a1b a*b a-b a=b aeb'))  # [a-z] matches any lowercase letter, so the result is ['aeb']
print(re.findall('a[a-zA-Z]b', 'a1b a*b a-b a=b aeb aEb'))  # [a-zA-Z] matches any letter, so the result is ['aeb', 'aEb']

# \
# print(re.findall('a\\c', 'a\c'))  # As a regex, a\\c does match a\c, but the Python interpreter processes the escapes in 'a\\c' first and hands a\c to re, so an exception is raised
print(re.findall(r'a\\c', 'a\c'))  # r tells the interpreter to use a raw string: the interpreter performs no escape processing, so the regex engine receives the pattern exactly as written
print(re.findall('a\\\\c', 'a\c'))  # Same meaning as the line above; both results are ['a\\c']

# (): grouping
print(re.findall('ab+', 'ababab123'))  # ['ab', 'ab', 'ab']
print(re.findall('(ab)+123', 'ababab123'))  # ['ab'], the ab of the trailing ab123
print(re.findall('(?:ab)+123', 'ababab123'))  # ['ababab123']; findall normally returns the group contents rather than the whole match, and ?: makes the result the whole match
print(re.findall('href="(.*)"', '<a href="http://www.baidu.com">Click</a>'))  # ['http://www.baidu.com']
print(re.findall('href="(?:.*?)"', '<a href="http://www.baidu.com">Click</a>'))  # ['href="http://www.baidu.com"']

# |
print(re.findall('compan(?:y|ies)', 'Too many companies have gone bankrupt, and the next one is my company'))  # ['companies', 'company']
# =========================== Introduction to the functions provided by the re module ===========================
import re

# 1
print(re.findall('e', 'alex make love'))  # ['e', 'e', 'e']; returns all results that satisfy the pattern, in a list

# 2
print(re.search('e', 'alex make love').group())  # e; finds only the first match and returns an object containing the match information. The matched string is obtained by calling the object's group() method; search returns None if the string does not match.

# 3
print(re.match('e', 'alex make love'))  # None; same as search, but matches only at the beginning of the string. search with ^ can be used instead of match.

# 4
print(re.split('[ab]', 'abcd'))  # ['', '', 'cd']; first split on 'a' to get '' and 'bcd', then split '' and 'bcd' on 'b'

# 5
print('===>', re.sub('a', 'A', 'alex make love'))  # ===> Alex mAke love; with no count given, all occurrences are replaced
print('===>', re.sub('a', 'A', 'alex make love', 1))  # ===> Alex make love
print('===>', re.sub('a', 'A', 'alex make love', 2))  # ===> Alex mAke love
print('===>', re.sub('^(\w+)(.*?\s)(\w+)(.*?\s)(\w+)(.*?)$', r'\5\2\3\4\1', 'alex make love'))  # ===> love make alex

print('===>', re.subn('a', 'A', 'alex make love'))  # ===> ('Alex mAke love', 2); the result includes the total number of replacements

# 6
obj = re.compile('\d{2}')

print(obj.search('abc123eeee').group())  # 12
print(obj.findall('abc123eeee'))  # ['12']; obj is reused
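import re
# Named group with a backreference. The subject string was lost from the original text; '<h1>hello</h1>' is a representative example.
print(re.findall("<(?P<tag_name>\w+)>\w+</(?P=tag_name)>", "<h1>hello</h1>"))  # ['h1']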
import re
print(re.findall(r'-?\d+\.?\d*', "1-12*(60+(-40.35/5)-(-4*3))"))  # find all numbers: ['1', '-12', '60', '-40.35', '5', '-4', '3']

# With |, the alternative that matches first takes effect. The left side of | matches decimals, but findall
# returns the contents of the group, so even when a decimal is matched successfully it is not stored in the
# result (an empty string is stored instead). When the number is not a decimal, (-?\d+) is tried, and what
# it matches is naturally a non-decimal number, which here means an integer.
print(re.findall(r"-?\d+\.\d*|(-?\d+)", "1-2*(60+(-40.35/5)-(-4*3))"))  # find all integers: ['1', '-2', '60', '', '5', '-4', '3']
# Reference for the calculator exercise: http://www.cnblogs.com/wupeiqi/articles/4949995.html
expression = '1-2*((60+2*(-3-40.0/5)*(9-2*5/3+7/3*99/4*2998+10*568/14))-(-4*3)/(16-3*2))'

content = re.search('\(([\-\+\*\/]*\d+\.?\d*)+\)', expression).group()  # (-3-40.0/5)
# Why do search and findall give different results for the same expression?
print(re.search('\(([\+\-\*\/]*\d+\.?\d*)+\)', "1-12*(60+(-40.35/5)-(-4*3))").group())  # (-40.35/5)
print(re.findall('\(([\+\-\*\/]*\d+\.?\d*)+\)', "1-12*(60+(-40.35/5)-(-4*3))"))  # ['/5', '*3']

# Look at this example: (\d)+ is equivalent to (\d)(\d)(\d)(\d)..., i.e. a series of groups
print(re.search('(\d)+', '123').group())  # 123; group() returns the whole match, i.e. all the repetitions of the group joined together
print(re.findall('(\d)+', '123'))  # ['3']; findall returns the contents of the group, and only those of its last repetition
#_ *_coding:utf-8_*___author__ = ' Linhaifeng ' #在线调试工具: tool.oschina.net/regex/#import res= "' http://www.baidu.com[ email protected] Hello 010-3141 ' #最常规匹配 # content= ' Hello 123 456 world_this is a Regex Demo ' # res=re.match (' hello\s\d\d \d\s\d{3}\s\w{10}.*demo ', content) # print (res) # print (Res.group ()) # Print (Res.span ()) #泛匹配 # content= ' Hello 123 456 World_this is a Regex Demo ' # res=re.match (' ^hello.*demo ', content) # Print (Res.group ()) #匹配目标, get the specified data # content= ' Hello 123 456 World_this is a Regex Demo ' # res=re.match (' ^hello\s (\d+) \s (\d+) \s.*demo ', content) # Print (Res.group ()) #取所有匹配的内容 # Print (Res.group (1)) #取匹配的第一个括号内的内容 # Print (Res.group (2)) #去陪陪的第二个括号内的内容 # greedy match:. * represents match as many characters as # import re# content= ' Hello 123 456 World_this is a Regex Demo ' # # Res=re.match (' ^he.* (\d+). *demo$ ', content) # Print (Res.group (1)) #只打印6 because. * will match as many as possible And then followed by at least one number # non-greedy match:? Match as few characters as possible # import re# content= ' Hello 123 456 world_this is a Regex Demo ' # # Res=re.match (' ^he.*? ( \d+). *demo$ ', content) # Print (Res.group (1)) #只打印6, because. * will match as many as possible, followed by at least oneNumber # match pattern: cannot match line break content= ' ' Hello 123456 world_thisis a Regex Demo ' # res=re.match (' he.*? ( \d+). *? demo$ ', content) # Print (res) #输出None # res=re.match (' he.*? ( \d+). *? Demo$ ', Content,re. S) #re. S Jean. Can match line break # print (res) # print (Res.group (1)) #转义: # content= ' Price was $5.00 ' # res=re.match (' Price is $5.00 ', content) # Print (res) # # Res=re.match (' Price is \$5\.00 ', content) # Print (res) #总结: try to be as concise as possible using the pan-match pattern as much as possible. * # try to use non-greedy mode:. *? # Use parentheses to get the matching target: use Group (n) to get results # with a newline character with re. 
S: Modify mode #re.search: Will scan the entire string, will not start from scratch, find the first matching result will return # import re# content= ' Extra strings Hello 123 456 world_this is a Regex Dem o Extra Strings ' # # Res=re.match (' hello.*? ( \d+). *? Demo ', content) # Print (res) #输出结果为None # # import re# content= ' Extra strings Hello 123 456 world_this is a Regex Demo Extra s Trings ' # # Res=re.search (' hello.*? ( \d+). *? Demo ', content) # # print (Res.group (1)) #输出结果为 #re.search: Just one result, match walkthrough, import recontent= "<tbody><tr id=" 4766303201494371851675 "class=" even "><td><div class=" HD "><span class=" num">1</span><div class=" RK "><span class=" U-icn u-icn-75 "></span></div></div> </TD><TD class= "Rank" ><div class= "F-CB" ><div class= "tt" ><a href= "/song?id=476630320" ></a><span data-res-id=" 476630320 "" # Res=re.search (' <a\shref=.*?<b\stitle ="(.*?)". *?b> ', content) # Print (Res.group (1)) #re. FindAll: Find all results matching the criteria # Res=re.findall (' <a\shref=.*?<b\stitle= ' (. *) ". *?b> ', content) # for I in res:# print (i) #re. Sub: string substitution import recontent= ' Extra strings Hello 123 456 World_this is a R Egex Demo Extra Strings ' # content=re.sub (' \d+ ', ', content) # Print (content) #用 \1 get the contents of the first parenthesis # usage: Swap 123 and 456 for position # import re# Content= ' Extra strings Hello 123 456 world_this is a Regex Demo Extra strings ' # # # content=re.sub (' (extra.*?) (\d+) (\s) (\d+) (. *?strings) ', R ' \1\4\3\2\5 ', content) # content=re.sub (' (\d+) (\s) (\d+) ', R ' \3\2\1 ', Content] # print (content) # import re# content= ' Extra strings Hello 123 456 world_this is a Regex Demo Extra Strings ' # # Res=re.search (' Extra.*? ( \d+). *strings ', content) # Print (Res.group (1)) # import requests,re# respone=requests.get (' https://book.douban.com/') . 
text# print (respone) # print (' ====== ' *1000) # print (' ====== ' *1000) # print (' ====== ' *1000) # print (' ====== ' *1000) # res= Re.findall (' <li.*?cover.*?href= ' (. *?) "). *?title= "(. *?)" >.*?more-meta.*?author "> (. *?) </span.*?year "> (. *?) </span.*?publisher "> (. *?) </span.*?</li> ', Respone,re. S) # # Res=re.findall (' <li.*?cover.*?href= ' (. *?) "). *?more-meta.*?author "> (. *?) </span.*?year "> (. *?) </span.*?publisher "> (. *?) </span>.*?</li> ', Respone,re. S # # # for I in res:# print ('%s%s%s '% (I[0].strip (), I[1].strip (), I[2].strip (), I[3].strip ()))
Python Regular expression