Python 中英文提取
#coding=utf8
import os,re
# Keep only the ASCII letters, digits and whitespace found in dianxi.txt
# and save them to 'dianxi-英文.txt'.
with open('dianxi.txt', mode='r', encoding='UTF-8') as f:  # open the input file
    data = f.read()  # read the whole file into memory as one string

# Raw string: \s must reach the regex engine as-is; in a normal string
# literal it is an invalid escape sequence and triggers a warning.
# Each match is a single character: an ASCII letter, a digit, or whitespace.
s = re.findall(r'[a-zA-Z0-9\s]', data)
kw = "".join(s)  # glue the kept characters back together in original order

with open('dianxi-英文.txt', mode='w', encoding='UTF-8') as fw:  # open the output file
    fw.write(kw)  # write the filtered text
或者，只提取中文（汉字和常用中文标点）：
#coding=utf8
import os,re
# Keep only the Chinese characters and common Chinese punctuation found in
# xiaoye.txt and save them to 'dianxi-中文.txt'.
# NOTE(review): the input is 'xiaoye.txt' but the output name starts with
# 'dianxi' - confirm that this output file name is intended.
with open('xiaoye.txt', mode='r', encoding='UTF-8') as f:  # open the input file
    data = f.read()  # read the whole file into memory as one string

# Character class, one match per character:
#   \u4e00-\u9fa5  CJK ideographs
#   \u3002 。  \uff1b ；  \uff0c ，  \uff1a ：  \u201c “  \u201d ”
#   \uff08 （  \uff09 ）  \u3001 、  \uff1f ？  \u300a 《  \u300b 》
s = re.findall('[\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]', data)
kw = "".join(s)  # glue the kept characters back together in original order

with open('dianxi-中文.txt', mode='w', encoding='UTF-8') as fw:  # open the output file
    fw.write(kw)  # write the filtered text
或者，只提取汉字（不含标点）：
#coding=utf8
import os,re
# Keep only the Chinese characters (no punctuation) found in liziqi.txt
# and save them to temp.txt.
with open('liziqi.txt', mode='r', encoding='UTF-8') as f:  # open the input file
    data = f.read()  # read the whole file into memory as one string

# \u4e00-\u9fa5 is the range of CJK ideographs; each match is one character.
s = re.findall('[\u4e00-\u9fa5]', data)
kw = "".join(s)  # glue the kept characters back together in original order

with open('temp.txt', mode='w', encoding='UTF-8') as fw:  # open the output file
    fw.write(kw)  # write the filtered text
或者，只提取英文字母和数字（不保留空格）：
#coding=utf8
import os,re
# Keep only the ASCII letters and digits found in liziqi.txt (whitespace is
# dropped too) and save them to tempenglish.txt.
with open('liziqi.txt', mode='r', encoding='UTF-8') as f:  # open the input file
    data = f.read()  # read the whole file into memory as one string

# Each match is a single ASCII letter or digit.
s = re.findall('[a-zA-Z0-9]', data)
kw = "".join(s)  # glue the kept characters back together in original order

with open('tempenglish.txt', mode='w', encoding='UTF-8') as fw:  # open the output file
    fw.write(kw)  # write the filtered text
参考文章:
https://www.cnblogs.com/pu369/p/12641828.html
https://blog.csdn.net/qq_28633249/article/details/77686976
https://www.iteye.com/blog/fushengfei-939137