import re

# Most explicit form of matching: every part of the sample text is spelled
# out with a character class (\s, \d, \w) and the pattern is anchored at
# both ends with ^ and $.
content = 'Hello 123 4567 World_This is a Regex Demo'
# Raw string literal so the backslash escapes reach the regex engine as-is
# instead of being treated as (invalid) Python string escapes.
result = re.match(r'^Hello\s\d\d\d\s\d{4}\s\w{10}.*Demo$', content)
print(result)
print(result.group())  # the full text that was matched
print(result.span())   # (start, end) offsets of the match within content
<_sre.SRE_Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)
泛匹配
1 2 3 4 5 6
import re

# Wildcard matching: only the fixed prefix and suffix are named; ".*"
# stands in for everything between them.
content = 'Hello 123 4567 World_This is a Regex Demo'
result = re.match(r'^Hello.*Demo$', content)
print(result)
print(result.group())  # the full text that was matched
print(result.span())   # (start, end) offsets of the match within content
<_sre.SRE_Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)
匹配目标
1 2 3 4 5 6
import re

# Capturing a target: the parentheses around \d+ make the digit run
# available as group 1 of the match.
content = 'Hello 1234567 World_This is a Regex Demo'
# Raw string literal keeps \s and \d intact for the regex engine.
result = re.match(r'^Hello\s(\d+)\sWorld.*Demo$', content)
print(result)
print(result.group(1))  # text captured by the first parenthesised group
print(result.span())    # (start, end) offsets of the whole match
<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
1234567
(0, 40)
贪婪匹配
1 2 3 4 5 6
import re

# Greedy matching: the leading ".*" consumes as much as it can while still
# letting the rest of the pattern succeed, so (\d+) is left with only the
# final digit of the text.
content = 'Hello 123 4567 World_This is a Regex Demo'
# Raw string literal keeps \d intact for the regex engine.
result = re.match(r'^He.*(\d+).*Demo$', content)
print(result)
print(result.group(1))  # just the last digit, because ".*" is greedy
print(result.span())    # (start, end) offsets of the whole match
<_sre.SRE_Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
7
(0, 41)
非贪婪匹配
1 2 3 4 5 6
import re

# Non-greedy matching: ".*?" consumes as little as possible, so it stops
# right before the first digit and (\d+) captures the complete number.
content = 'Hello 1234567 World_This is a Regex Demo'
# Raw string literal keeps \d intact for the regex engine.
result = re.match(r'^He.*?(\d+).*Demo$', content)
print(result)
print(result.group(1))  # the whole digit run
print(result.span())    # (start, end) offsets of the whole match
<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
1234567
(0, 40)
匹配模式
1 2 3 4 5
import re

# Match flags: the text spans two lines. By default "." does not match a
# newline; passing re.S (DOTALL) lets ".*?" run across the line break.
content = """Hello 1234567 World_This
is a Regex Demo"""
# Raw string literal keeps \d intact for the regex engine.
result = re.match(r'^He.*?(\d+).*?Demo$', content, re.S)
print(result.group(1))  # the captured digit run
1234567
转义
1 2 3 4
import re

# Counter-example for escaping: "$" and "." are regex metacharacters, so
# this intentionally unescaped pattern does not match the literal price
# text and re.match returns None.
content = 'price is $5.00'
result = re.match('^price is $5.00$', content)
print(result)  # None
None
1 2 3 4
import re

# Escaping metacharacters: "\$" and "\." match a literal dollar sign and a
# literal dot, so the price text is matched in full.
content = 'price is $5.00'
# Raw string literal so the backslashes are passed to the regex engine
# unchanged rather than being read as Python string escapes.
result = re.match(r'^price is \$5\.00$', content)
print(result)
<_sre.SRE_Match object; span=(0, 14), match='price is $5.00'>
总结:尽可能多地使用泛匹配,使用括号得到匹配目标,尽量使用非贪婪模式,字符串中有换行符时使用re.S
re.search
re.search扫描整个字符串,然后返回第一个成功的匹配
1 2 3 4
import re

# re.match only tries to match at the very beginning of the string. The
# text starts with "Extra strings", not "Hello", so the result is None.
content = 'Extra strings Hello 1234567 World_This is a Regex Demo Extra strings'
# Raw string literal keeps \d intact for the regex engine.
result = re.match(r'Hello.*?(\d+).*Demo', content)
print(result)  # None
None
1 2 3 4
import re

# re.search scans the whole string and returns the first location where
# the pattern matches, so the "Extra strings" prefix is skipped.
content = 'Extra strings Hello 1234567 World_This is a Regex Demo Extra strings'
# Raw string literal keeps \d intact for the regex engine.
result = re.search(r'Hello.*?(\d+).*Demo', content)
print(result)
<_sre.SRE_Match object; span=(14, 54), match='Hello 1234567 World_This is a Regex Demo'>
总结:为匹配方便,能用re.search就不用re.match
re.findall
搜索字符串,以列表形式返回所有能匹配的子串
re.sub
替换字符串中每一个匹配的子串后返回替换后的字符串
1 2 3 4
import re

# re.sub with an empty replacement removes every run of digits from the
# text and returns the resulting string.
content = 'Extra strings Hello 1234567 World_This is a Regex Demo Extra strings'
# Raw string literal keeps \d intact for the regex engine.
result = re.sub(r'\d+', '', content)
print(result)
1 2 3 4
import re

# re.sub replaces every run of digits with the fixed word "replacement"
# and returns the resulting string.
content = 'Extra strings Hello 1234567 World_This is a Regex Demo Extra strings'
# Raw string literal keeps \d intact for the regex engine.
result = re.sub(r'\d+', 'replacement', content)
print(result)
Extra strings Hello replacement World_This is a Regex Demo Extra strings
1 2 3 4
import re

# Back-reference in the replacement: "\1" re-inserts whatever group 1
# captured, so " 8901" is appended after the original digits instead of
# replacing them.
content = 'Extra strings Hello 1234567 World_This is a Regex Demo Extra strings'
# Both pattern and replacement are raw strings so \d and \1 are handed to
# the re module unchanged.
result = re.sub(r'(\d+)', r'\1 8901', content)
print(result)
Extra strings Hello 1234567 8901 World_This is a Regex Demo Extra strings
re.compile
将一个正则表达式串编译成正则表达式对象,以便于复用该匹配模式
1 2 3 4 5 6 7 8
import re

# re.compile turns a pattern string (plus flags) into a reusable pattern
# object. The two matches below are equivalent: one goes through the
# compiled object, the other passes the pattern string and flag directly.
content = '''Hello 1234567 World_This
is a Regex Demo'''
# re.S lets "." cross the line break inside content.
pattern = re.compile(r'^He.*Demo', re.S)
result = pattern.match(content)
result1 = re.match(r'He.*Demo', content, re.S)
print(result)
print(result1)
<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This\nis a Regex Demo'>
<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This\nis a Regex Demo'>
# Debug aid: uncomment to dump the raw list of matches.
# print(results)

# Each entry of `results` is unpacked into four fields; all whitespace is
# stripped from name, author and date before the row is printed.
# NOTE(review): `results` and `re` must already be defined by the code that
# precedes this fragment — confirm against the full script.
for result in results:
    url, name, author, date = result
    # Raw string literal keeps \s intact for the regex engine.
    name = re.sub(r'\s', '', name)
    author = re.sub(r'\s', '', author)
    date = re.sub(r'\s', '', date)
    print(url, name, author, date)
1 2 3 4 5 6 7 8 9 10 11
import re

import requests

# Download the page and pull (url, title, author, year) tuples out of the
# HTML with a single regular expression.
# A timeout keeps the script from hanging forever if the server never
# responds.
content = requests.get('https://book.douban.com/', timeout=10).text

# re.S lets ".*?" span line breaks in the HTML. The pattern is written as
# adjacent raw string literals purely for line length; they concatenate to
# one pattern.
pattern = re.compile(
    r'<li.*?cover.*?href="(.*?)".*?title="(.*?)".*?more-meta'
    r'.*?author">(.*?)</span>.*?year">(.*?)</span>.*?</li>',
    re.S,
)
results = pattern.findall(content)
print(results)

for result in results:
    url, name, author, date = result
    # Remove all whitespace (including line breaks) from the captured text.
    author = re.sub(r'\s', '', author)
    date = re.sub(r'\s', '', date)
    print(url, name, author, date)
1 2 3 4 5 6 7 8 9 10 11 12
import re

import requests

# Download the page and pull (url, title, author, year, publisher,
# abstract) tuples out of the HTML with a single regular expression.
# A timeout keeps the script from hanging forever if the server never
# responds.
content = requests.get('https://book.douban.com/', timeout=10).text

# re.S lets ".*?" span line breaks in the HTML. The pattern is written as
# adjacent raw string literals purely for line length; they concatenate to
# one pattern.
# NOTE(review): unlike the other fields, the last group follows `abstract"`
# with no ">" in between, so the capture starts immediately after the
# quote — confirm whether `abstract">` was intended.
pattern = re.compile(
    r'<li.*?cover.*?href="(.*?)".*?title="(.*?)".*?more-meta'
    r'.*?author">(.*?)</span>.*?year">(.*?)</span>'
    r'.*?publisher">(.*?)</span>.*?abstract"(.*?)</p>.*?</li>',
    re.S,
)
results = pattern.findall(content)

for result in results:
    url, name, author, year, publisher, abstract = result
    # Remove all whitespace (including line breaks) from the captured text;
    # url and abstract are printed as captured.
    name = re.sub(r'\s', '', name)
    author = re.sub(r'\s', '', author)
    year = re.sub(r'\s', '', year)
    publisher = re.sub(r'\s', '', publisher)
    print(url, name, author, year, publisher, abstract)