#!/usr/bin/python
# -*- coding: gbk -*-
# Regular-expression pattern matching on text that contains Chinese, in Python.
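'''
spec: Split usrdict into two parts according to whether each entry hits one
      of the 1.26 million (126W) person names.
parms:
[IN]  key file: person names, one key per line
[IN]  input file: usrdict lines made of 4 tab-separated fields
[OUT] output file 1: lines that hit a name, with the matching key appended
[OUT] output file 2: lines that hit no name, with "_" appended
[OUT] output file 3: lines whose 3rd or 4th field is <= 0
author: liuyusi0121@sogou-inc date 20120808
'''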
import re;
import sys;
def LoadKeys(filename):
    '''
    Load the keys from a file into memory.

    Every line of `filename` is read, leading and trailing whitespace is
    removed from it, and the result is appended to the returned list, in
    file order. A blank line yields an empty string, so the list always
    has exactly one entry per line of the file.

    filename: path of the key file to read.
    returns:  list of stripped lines.
    raises:   IOError/OSError if the file cannot be opened or read.
    '''
    # Same strip pattern as before; written as a raw string so the
    # backslashes reach the regex engine unchanged.
    p = re.compile(r'^\s+|\s+$')
    # open() instead of the Python-2-only file() builtin; the with-block
    # closes the file even if reading fails part-way through.
    with open(filename, "r") as fid:
        return [p.sub('', line) for line in fid]
def PrintUsage():
    '''
    Print the command-line usage to stdout and exit with status 1.

    The script takes five arguments: two input paths followed by three
    output paths. This function never returns; it raises SystemExit(1).
    '''
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('program [IN] [IN] [OUT] [OUT] [OUT]')
    # sys.exit() rather than the exit() helper, which is only injected by the
    # site module and is meant for interactive use.
    sys.exit(1)
def _find_matching_key(useg, keys):
    '''
    Return the first key whose pattern hits `useg`, or None if none does.

    For each key the pattern "(^key|key$)" is built, decoded as GBK and
    searched in `useg`, so a key hits when it matches at the very start or
    at the very end of the text. Keys are inserted into the pattern
    unescaped, i.e. they are treated as regular expressions.

    useg: unicode text to test.
    keys: list of byte strings; each one is decoded as GBK.

    NOTE(review): every key pattern is rebuilt and recompiled for every
    text that is tested. Precompiling once would be much faster but keeps
    one compiled pattern per key in memory; left as is on purpose.
    '''
    for count, key in enumerate(keys):
        if 0 == count % 100000:
            print('模式已经扫描%d个' % count)
        if not key:
            # An empty key would give "(^|$)", which hits every text.
            continue
        patternstr = "(^" + key + "|" + key + "$)"
        try:
            pattern = re.compile(unicode(patternstr, "gbk"))
        except (UnicodeDecodeError, re.error):
            # A key that is not valid GBK or not a valid regular expression
            # cannot be tested; skip it and keep scanning the other keys.
            continue
        if pattern.search(useg):
            return key
    return None


def _split_lines(fid, keys, fout1, fout2, fout3):
    '''
    Route every line read from `fid` to one of the three output files.

    Each line is stripped of surrounding whitespace; empty lines and lines
    that do not have exactly 4 tab-separated fields are dropped.
      * 3rd or 4th field <= 0        -> fout3, line written unchanged
      * 1st field hits a key         -> fout1, with the key appended
      * otherwise (including a 1st field that is not valid GBK)
                                     -> fout2, with "_" appended

    raises: ValueError if the 3rd or 4th field is not an integer.
    '''
    delim = "\t"
    p = re.compile(r"(^\s+|\s+$)")
    linecount = 0
    for line in fid:
        line = p.sub('', line)
        if '' == line:
            continue
        if 0 == linecount % 100000:
            print('语料已经处理%d行' % linecount)
        linecount += 1
        linesegs = line.split("\t")
        if 4 != len(linesegs):
            continue
        if int(linesegs[2]) <= 0 or int(linesegs[3]) <= 0:
            fout3.write(line + "\n")
            continue
        try:
            useg = unicode(linesegs[0], 'gbk')
        except UnicodeDecodeError:
            # The first field cannot be decoded, so no key can hit it.
            key = None
        else:
            key = _find_matching_key(useg, keys)
        if key is None:
            linesegs.append("_")
            fout2.write(delim.join(linesegs) + "\n")
        else:
            print(line)
            linesegs.append(key)
            fout1.write(delim.join(linesegs) + "\n")


if __name__ == "__main__":
    # argv: keyfile inputfile outputfile1 outputfile2 outputfile3
    if len(sys.argv) != 6:
        PrintUsage()
    keyfile = str(sys.argv[1])
    keys = LoadKeys(keyfile)
    print(len(keys))
    inputfile = str(sys.argv[2])
    outputfile1 = str(sys.argv[3])
    outputfile2 = str(sys.argv[4])
    outputfile3 = str(sys.argv[5])
    # Outputs are opened before the input, as before; the with-blocks close
    # every file even when processing stops with an error.
    with open(outputfile1, 'w') as fout1:
        with open(outputfile2, 'w') as fout2:
            with open(outputfile3, 'w') as fout3:
                with open(inputfile, "r") as fid:
                    _split_lines(fid, keys, fout1, fout2, fout3)