正则表达式是一个特殊的字符序列,描述了一种字符串匹配的模式,可以用来检查一个字符串是否含有某个子串
可以精确匹配和模糊匹配,还可以进行替换、分割等操作
正则具有贪婪性,贪婪性是指在能发生匹配的前提下,尽可能多地匹配
Python中re模块拥有正则表达式全部功能
正则表达式由普通字符和元字符组成。普通字符包括字母、数字和打印符号,元字符是有特殊含义的字符
常用方法
re.match()
从字符串开头匹配,匹配不到返回None
re.match(r"ab","abc") # <re.Match object; span=(0, 2), match='ab'>
print(re.match(r"ab","cab")) # None
re.search()
扫描整个字符串,返回第一个匹配,匹配不到返回None
re.search(r"ab","abc")# <re.Match object; span=(0, 2), match='ab'>
re.search(r"ab","cab") # <re.Match object; span=(1, 3), match='ab'>
re.findall()
匹配所有,返回一个列表
re.findall(r"\d+","1a12b123c1234d") # ['1', '12', '123', '1234']
re.findall(r"[a-zA-Z]+","1aa12bb123cccc1234dddd") # ['aa', 'bb', 'cccc', 'dddd']
re.findall(r"\d+","1a2b3c") # ['1', '2', '3']
re.findall(r"[a-z](\d+)","1a2b3c") # ['2', '3'] 有一个分组时只返回分组内容
re.findall(r"([a-z])(\d+)","1a2b3c") # [('a', '2'), ('b', '3')] 有多个分组时返回元组列表
re.compile()
编译,使得正则表达式可以复用
pattern = re.compile(r"\d+")
pattern.search("123abc456") # <re.Match object; span=(0, 3), match='123'>
pattern.search("456abc456") # <re.Match object; span=(0, 3), match='456'>
指定匹配的起始位置(和结束位置)时,必须使用编译后的pattern对象的方法
p = re.compile(r"abc")
p.match("123abc",3) # <re.Match object; span=(3, 6), match='abc'>
p.match("123abc",3,6) # <re.Match object; span=(3, 6), match='abc'>
pattern.split()
分割
p = re.compile(r"\d+")
relist = p.split("one123two456three")
print(relist) # ['one', 'two', 'three']
s = "a 2 b 2 c 5 d"
p = re.compile(r"\s+\d\s+")
relist = p.split(s)
print(relist) # ['a', 'b', 'c', 'd']
relist = p.split(s,2)
print(relist) # ['a', 'b', 'c 5 d']
re.finditer()
返回一个迭代器,逐个产生匹配对象
for i in re.finditer(r"[A-Za-z]+","one123two456three"):
    print(i.group(),end= " ") # one two three
re.sub()
替换
re.sub(r"\d+","**","aa11bb22cc") # 'aa**bb**cc'
re.subn()
替换并返回替换次数
re.subn(r"\d+","**","aa11bb22cc")
# ('aa**bb**cc', 2)
group()
提取匹配的内容
re.search(r"\d+(\D+)\d+","123abc345").group() # '123abc345'
groups
查看分组个数
p = re.compile(r"(\d+)(\w+)")
p.groups # 2
常用符号
.
匹配换行符之外的任意一个字符,匹配不到返回None
import re
re.match(r".","abc") # <re.Match object; span=(0, 1), match='a'>
re.match(r"..","abc") # <re.Match object; span=(0, 2), match='ab'>
re.match(r".","\n") # 匹配不到
print(re.match(r".","\n")) # None
re.match(r".","\n",re.DOTALL) # <re.Match object; span=(0, 1), match='\n'> re.DOTALL表示把.改成可以匹配所有字符
re.match(r"\.",".abc") # <re.Match object; span=(0, 1), match='.'> 只匹配.
re.match(r"\\","\\abc") # <re.Match object; span=(0, 1), match='\\'>
\
转义
re.match(r"\.",".abc") # <re.Match object; span=(0, 1), match='.'> 转义匹配.
re.match(r"\\","\\abc") # <re.Match object; span=(0, 1), match='\\'> 转义匹配\\
[]
匹配括号中任意内容,只匹配一个字符
re.match(r"[abc]","cxxx") # <re.Match object; span=(0, 1), match='c'>
re.match(r"[abc]","xx") # 未匹配到
[^]
匹配非括号中内容
re.match(r"[^abc]","xabcxx") # <re.Match object; span=(0, 1), match='x'>
re.match(r"[^abc]","abcxx") # 未匹配到
^
表示从头开始匹配
re.match(r"^abc","abcxx") # <re.Match object; span=(0, 3), match='abc'>
re.match(r"^abc","xabcxx") # 未匹配到
$
匹配结尾
re.search(r"\d+$","123abc456") # <re.Match object; span=(6, 9), match='456'>
re.search(r"^123$","123") # <re.Match object; span=(0, 3), match='123'>
re.search(r"\A123\Z","123") # <re.Match object; span=(0, 3), match='123'> \A \Z也可以表示开头和结尾
\d
匹配数字
re.match(r"\d","1234") # <re.Match object; span=(0, 1), match='1'>
\D
匹配非数字
re.match(r"\D","d123") # <re.Match object; span=(0, 1), match='d'>
re.match(r"\D+","abcd123") # <re.Match object; span=(0, 4), match='abcd'>
\w
匹配字母数字下划线
re.search(r"\w","a1_") # <re.Match object; span=(0, 1), match='a'>
re.search(r"\w+","aA1_ ") # <re.Match object; span=(0, 4), match='aA1_'>
\W
匹配非字母数字下划线
re.search(r"\W","aA1_- ") # <re.Match object; span=(4, 5), match='-'>
\s
匹配空白字符(空格、制表符\t、换行符\n等)
re.search(r"\s","abc def") # <re.Match object; span=(3, 4), match=' '>
re.search(r"\s+","abc\n \tdef") # <re.Match object; span=(3, 6), match='\n \t'>
\S
匹配非空白
re.findall(r"\S+","ab cd\t ef\nhi") # ['ab', 'cd', 'ef', 'hi']
"".join(re.findall(r"\S+","ab cd\t ef\nhi")) # 'abcdefhi'
()
分组
re.search(r"(\d+)(\D+)(\d+)","123abc345").group(0) # '123abc345'
re.search(r"(\d+)(\D+)(\d+)","123abc345").group(1) # '123'
re.search(r"(\d+)(\D+)(\d+)","123abc345").group(2) # 'abc'
re.search(r"(\d+)(\D+)(\d+)","123abc345").group(3) # '345'
{m}
匹配前一个字符恰好m次
re.search(r"\d{2}","123df") # <re.Match object; span=(0, 2), match='12'>
{m,n}
匹配前一个字符m到n次,贪婪模式下尽可能多地匹配
re.search(r"\d{2,5}","1234567df") # <re.Match object; span=(0, 5), match='12345'>
+
匹配1个或多个
re.match(r"\d+","1234") # <re.Match object; span=(0, 4), match='1234'>
re.match(r"\d+?","1234") # <re.Match object; span=(0, 1), match='1'>
*
匹配0个或多个
re.match(r"\d*","1234") # <re.Match object; span=(0, 4), match='1234'>
re.match(r"\d*","abc") # <re.Match object; span=(0, 0), match=''> 0个也能匹配,匹配到空
?
单独使用时匹配0个或1个;放在其他量词(+、*、{m,n})后面时抑制贪婪性,表示最小匹配
re.match(r"\d*?","a1234") # <re.Match object; span=(0, 0), match=''> 匹配到空
re.match(r"\d*?","1234") # <re.Match object; span=(0, 0), match=''> 匹配到空
re.search(r"\d?","a7").group() # '' 匹配到空
\b
匹配单词边界
re.findall(r"\b[A-Z]+[a-z]*|[a-z]+\b","ABaaaabAA abc ABC") # ['ABaaaab', 'abc', 'ABC']
|
表示或,匹配左右任意一个表达式
re.findall(r"[A-Z]+[a-z]+|[a-z]+","ABaaaabAA abc ABC") # ['ABaaaab', 'abc']
re.findall(r"[A-Z]+[a-z]*|[a-z]+","ABaaaabAA abc ABC") # ['ABaaaab', 'AA', 'abc', 'ABC']
re.findall(r"\b[A-Z]+[a-z]*|[a-z]+\b","ABaaaabAA abc ABC") # ['ABaaaab', 'abc', 'ABC']
re.S = re.DOTALL
表示让.可以匹配包括换行符在内的全部字符(下例的\w+中没有.,所以结果不受该标志影响)
re.match(r"\w+","abc\n 123",re.DOTALL) # <re.Match object; span=(0, 3), match='abc'>
re.I
表示忽略大小写
re.match(r"abc","ABc",re.I) # <re.Match object; span=(0, 3), match='ABc'>
re.M
表示多行模式,^和$分别匹配每一行的开头和结尾
re.search(r"[a-z]$","a\nb\nc\n") # <re.Match object; span=(4, 5), match='c'>
re.search(r"[a-z]$","a\nb\nc\n",re.M) # <re.Match object; span=(0, 1), match='a'>
re.findall(r"[a-z]$","a\nb\nc\n",re.M) # ['a', 'b', 'c']
前向肯定(?=...)和后向肯定(?<=...)
re.search(r"((?<=abc)\d+)","abc123deb") # <re.Match object; span=(3, 6), match='123'>
re.search(r"(\d+(?=abc))","abc123abc") # <re.Match object; span=(3, 6), match='123'>
re.search(r"((?<=abc)\d+(?=abc))","abc123abc") # <re.Match object; span=(3, 6), match='123'>
前向否定(?!...)和后向否定(?<!...)
re.search(r"(?<!abc)\d+","abc123def") # <re.Match object; span=(4, 6), match='23'>
re.search(r"\d+(?!abc)","abc123def") # <re.Match object; span=(3, 6), match='123'>
练习
"abc123abc"字符串中匹配123
re.search(r"\d+","abc123abc") # <re.Match object; span=(3, 6), match='123'>
re.search(r"\d+","abc123abc").group() # '123'
re.search(r"\d{2}","abc123abc").group() # '12'
"1a12b123c1234d" 找出字符串中最长的连续数字
re.findall(r"\d+","1a12b123c1234d") # ['1', '12', '123', '1234']
result = ""
for i in re.findall(r"\d+","1a12b123c1234d"):
    if len(i) > len(result):
        result = i
print(result) # 1234
本文深入解析正则表达式的概念、语法与应用场景,通过实例演示如何使用Python的re模块进行字符串匹配、替换、分割等操作,适用于初学者及进阶用户。
4376

被折叠的 条评论
为什么被折叠?



