tongsiying

阅读|运动|自律

0%

正则表达式

## re模块

re.match

最常规的匹配

1
2
3
4
5
6
import re

# Plain match: describe every part of the text with explicit character classes.
content = 'Hello 123 4567 World_This is a Regex Demo'
# Raw string so that \s, \d and \w reach the regex engine unchanged; in a normal
# string literal they are invalid escape sequences that newer Pythons warn about.
result = re.match(r'^Hello\s\d\d\d\s\d{4}\s\w{10}.*Demo$', content)
print(result)          # the match object
print(result.group())  # the full matched text
print(result.span())   # (start, end) offsets of the match
<_sre.SRE_Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)

泛匹配

1
2
3
4
5
6
import re

# Generic match: pin down only the start and the end, and let ".*" absorb the middle.
content = 'Hello 123 4567 World_This is a Regex Demo'
result = re.match('^Hello.*Demo$', content)
# Show the match object, the matched text and its (start, end) span, one per line.
for shown in (result, result.group(), result.span()):
    print(shown)
<_sre.SRE_Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)

匹配目标

1
2
3
4
5
6
import re

# Capture a target: the parentheses form group 1, which holds the run of digits.
content = 'Hello 1234567 World_This is a Regex Demo'
# Raw string keeps \s and \d intact for the regex engine (no invalid-escape warning).
result = re.match(r'^Hello\s(\d+)\sWorld.*Demo$', content)
print(result)
print(result.group(1))  # only the captured digits
print(result.span())
<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
1234567
(0, 40)

贪婪匹配

1
2
3
4
5
6
import re

# Greedy match: the first ".*" takes as many characters as it can, so the
# capturing group (\d+) is left with only the last digit.
content = 'Hello 123 4567 World_This is a Regex Demo'
# Raw string keeps \d intact for the regex engine (no invalid-escape warning).
result = re.match(r'^He.*(\d+).*Demo$', content)
print(result)
print(result.group(1))  # '7', not the whole number
print(result.span())
<_sre.SRE_Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
7
(0, 41)

非贪婪匹配

1
2
3
4
5
6
import re

# Non-greedy match: ".*?" takes as few characters as possible, so the
# capturing group (\d+) receives the complete number.
content = 'Hello 1234567 World_This is a Regex Demo'
# Raw string keeps \d intact for the regex engine (no invalid-escape warning).
result = re.match(r'^He.*?(\d+).*Demo$', content)
print(result)
print(result.group(1))  # '1234567'
print(result.span())
<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
1234567
(0, 40)

匹配模式

1
2
3
4
5
import re

# Match mode: the text contains a newline, so re.S is passed to let "." match it too.
content = """Hello 1234567 World_This
is a Regex Demo"""
# Raw string keeps \d intact for the regex engine (no invalid-escape warning).
result = re.match(r'^He.*?(\d+).*?Demo$', content, re.S)
print(result.group(1))
1234567

转义

1
2
3
4
import re

# Without escaping, "$" and "." keep their special regex meaning, so this
# pattern does not match the literal price text and re.match returns None.
content = 'price is $5.00'
unescaped_pattern = '^price is $5.00$'
result = re.match(unescaped_pattern, content)
print(result)
None
1
2
3
4
import re

# Escaping: "\$" and "\." make the dollar sign and the dot match literally.
content = 'price is $5.00'
# Raw string so the backslashes reach the regex engine; "\$" and "\." are not
# valid escapes in a normal string literal and trigger a warning there.
result = re.match(r'^price is \$5\.00$', content)
print(result)
<_sre.SRE_Match object; span=(0, 14), match='price is $5.00'>

总结:尽可能多的使用泛匹配,使用括号得到匹配目标,尽量使用非贪婪模式,有换行符使用re.S

re.search扫描整个字符串,然后返回第一个成功的匹配

1
2
3
4
import re

# re.match only tries at the very beginning of the string; the text starts with
# "Extra strings", so a pattern beginning with "Hello" yields None.
content = 'Extra strings Hello 1234567 World_This is a Regex Demo Extra strings'
# Raw string keeps \d intact for the regex engine (no invalid-escape warning).
result = re.match(r'Hello.*?(\d+).*Demo', content)
print(result)
None
1
2
3
4
import re

# re.search scans the whole string and returns the first successful match,
# so the leading "Extra strings" no longer prevents a hit.
content = 'Extra strings Hello 1234567 World_This is a Regex Demo Extra strings'
# Raw string keeps \d intact; the stray trailing character that made this
# line a syntax error has been removed.
result = re.search(r'Hello.*?(\d+).*Demo', content)
print(result)
<_sre.SRE_Match object; span=(14, 54), match='Hello 1234567 World_This is a Regex Demo'>

总结:为匹配方便,能用re.search就不用re.match

re.findall

搜索字符串,以列表形式返回所有能匹配的子串

re.sub

替换字符串中每一个匹配的子串后返回替换后的字符串

1
2
3
4
import re

# Replace every run of digits with the empty string, i.e. delete the number.
content = 'Extra strings Hello 1234567 World_This is a Regex Demo Extra strings'
# Raw string keeps \d intact for the regex engine (no invalid-escape warning).
result = re.sub(r'\d+', '', content)
# Prints: Extra strings Hello  World_This is a Regex Demo Extra strings
print(result)
1
2
3
4
import re

# Replace every run of digits with a fixed piece of text.
content = 'Extra strings Hello 1234567 World_This is a Regex Demo Extra strings'
# Raw string keeps \d intact for the regex engine (no invalid-escape warning).
result = re.sub(r'\d+', 'replacement', content)
print(result)
Extra strings Hello replacement World_This is a Regex Demo Extra strings
1
2
3
4
import re

# Keep the matched digits and append text after them: "\1" in the replacement
# refers back to what group 1 captured.
content = 'Extra strings Hello 1234567 World_This is a Regex Demo Extra strings'
# Both the pattern and the replacement are raw strings so the backslashes survive.
result = re.sub(r'(\d+)', r'\1 8901', content)
print(result)
Extra strings Hello 1234567 8901 World_This is a Regex Demo Extra strings

re.compile

将一个正则表达式串编译成正则表达式对象,以便于复用该匹配模式

1
2
3
4
5
6
7
8
import re

# The text spans two lines, so both patterns below use re.S to let "." cross the newline.
content = '''Hello 1234567 World_This
is a Regex Demo'''
# Compile once; the pattern object carries its flags and can be reused.
pattern = re.compile('^He.*Demo', re.S)
result = pattern.match(content)
# Equivalent one-off call that passes the pattern text and the flag directly.
result1 = re.match('He.*Demo', content, re.S)
for found in (result, result1):
    print(found)
<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This\nis a Regex Demo'>
<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This\nis a Regex Demo'>

爬取豆瓣读书

1
2
3
4
5
6
import requests
import re

# Download the Douban Books front page and look for the first book entry.
content = requests.get('https://book.douban.com/').text
# Every gap is non-greedy (".*?"). A greedy ".*" before "more-meta" would run to
# the last "more-meta" on the page and mix fields from different <li> items.
# re.S lets "." match the newlines inside the HTML.
pattern = re.compile(
    '<li.*?cover.*?href="(.*?)".*?title="(.*?)"'
    '.*?more-meta.*?author">(.*?)</span>'
    '.*?year">(.*?)</span>.*?</li>',
    re.S,
)
results = re.search(pattern, content)
print(results)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import requests
import re

# Download the Douban Books front page and list every book entry found on it.
content = requests.get('https://book.douban.com/').text

# A whitespace character must be matched between the URL (href) and the title
# attribute, hence the "\s". The year group and the tail before </li> use ".*?":
# the previous "(.?)" / ".?" could match at most one character, so a real year
# could never be captured. re.S lets "." match the newlines inside the HTML.
pattern = re.compile(
    r'<li.*?cover.*?href="(.*?)"\stitle="(.*?)"'
    r'.*?more-meta.*?author">(.*?)</span>'
    r'.*?year">(.*?)</span>.*?</li>',
    re.S,
)
results = re.findall(pattern, content)

for result in results:
    # Each result is a 4-tuple, one item per capturing group.
    url, name, author, date = result
    # Drop all whitespace (including newlines) captured from the HTML layout.
    name = re.sub(r'\s', '', name)
    author = re.sub(r'\s', '', author)
    date = re.sub(r'\s', '', date)
    print(url, name, author, date)
1
2
3
4
5
6
7
8
9
10
11
import requests
import re

# Download the Douban Books front page and list every book entry found on it.
content = requests.get('https://book.douban.com/').text
# Non-greedy gaps keep each match inside one <li>; re.S lets "." match newlines.
pattern = re.compile(
    '<li.*?cover.*?href="(.*?)".*?title="(.*?)"'
    '.*?more-meta.*?author">(.*?)</span>'
    '.*?year">(.*?)</span>.*?</li>',
    re.S,
)
results = re.findall(pattern, content)
print(results)
for result in results:
    # Each result is a 4-tuple, one item per capturing group.
    url, name, author, date = result
    # Drop all whitespace (including newlines) captured from the HTML layout.
    author = re.sub(r'\s', '', author)
    date = re.sub(r'\s', '', date)
    print(url, name, author, date)
1
2
3
4
5
6
7
8
9
10
11
12
import requests
import re

# Download the Douban Books front page and list every book entry with its
# URL, title, author, year, publisher and abstract.
content = requests.get('https://book.douban.com/').text
# Non-greedy gaps keep each match inside one <li>; re.S lets "." match newlines.
# 'abstract"[^>]*>' consumes the rest of the opening tag so that the captured
# abstract no longer starts with the tag's closing ">".
# NOTE(review): assumes the abstract sits in a tag like <p class="abstract">; verify
# against the live page markup.
pattern = re.compile(
    '<li.*?cover.*?href="(.*?)".*?title="(.*?)"'
    '.*?more-meta.*?author">(.*?)</span>'
    '.*?year">(.*?)</span>'
    '.*?publisher">(.*?)</span>'
    '.*?abstract"[^>]*>(.*?)</p>.*?</li>',
    re.S,
)
results = re.findall(pattern, content)
for result in results:
    # Each result is a 6-tuple, one item per capturing group.
    url, name, author, year, publisher, abstract = result
    # Drop all whitespace (including newlines) captured from the HTML layout.
    name = re.sub(r'\s', '', name)
    author = re.sub(r'\s', '', author)
    year = re.sub(r'\s', '', year)
    publisher = re.sub(r'\s', '', publisher)
    print(url, name, author, year, publisher, abstract)
赞赏一下吧~