This repository has been archived by the owner on Oct 12, 2017. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 15
/
StripWhiteChars
114 lines (99 loc) · 3.61 KB
/
StripWhiteChars
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
= Strip whitespace between HTML tags in your pages with CherryPy =
{{{
#!python
import re
class StripWhiteChars(object):
    """Strip the whitespace that sits between adjacent HTML tags.

    Elements listed in ``omitTags`` are protected: each one is swapped for
    a placeholder before stripping and put back verbatim afterwards, so
    their content is never altered.

    Author: Jaroslaw Zabiello (http://zabiello.com)
    Version: 2004-11-26
    """

    # Elements whose content must survive byte-for-byte.
    omitTags = ['xmp', 'pre', 'script', 'style', 'textarea', 'code']

    def __init__(self):
        """Compile one matcher per protected tag plus the stripping regex.

        ``self.mark`` is a list of ``{'tag': ..., 'regex': ...}`` records in
        ``omitTags`` order; ``self.regexStrip`` matches two tags separated
        only by whitespace.
        """
        # new object style: http://python.org/doc/newstyle.html
        super(StripWhiteChars, self).__init__()
        flags = re.IGNORECASE | re.MULTILINE | re.DOTALL
        self.mark = []
        for tag in self.omitTags:
            # \b stops "<pre" from also matching longer names like "<preview".
            pattern = '<' + tag + r'\b[^>]*?>.*?</' + tag + '>'
            self.mark.append({'tag': tag, 'regex': re.compile(pattern, flags)})
        self.regexStrip = re.compile(
            r'(<[^>]+?>)[\s]+?(<[^>]+?>)', re.MULTILINE | re.DOTALL)

    def strip(self, html):
        """Return ``html`` with whitespace between adjacent tags removed.

        Whitespace next to a protected element is kept as well, because the
        placeholder standing in for that element does not look like a tag
        to ``regexStrip``.

        :param html: markup to process (a string).
        :returns: the stripped markup; protected elements are unchanged.
        """
        # (placeholder, original text) pairs, in the order they were created.
        marked = []
        for entry in self.mark:
            tag = entry['tag']

            def protect(match, tag=tag):
                # len(marked) grows with every stash, so each placeholder
                # is unique across all tags.
                placeholder = '@@@STRIPPED%s%d@@@' % (tag, len(marked))
                marked.append((placeholder, match.group(0)))
                return placeholder

            html = entry['regex'].sub(protect, html)

        # One pass cannot collapse "<a> <b> <c>" because matches do not
        # overlap, so repeat until nothing is left to strip.
        while self.regexStrip.search(html):
            html = self.regexStrip.sub(r'\1\2', html)

        # Restore newest-first: a later placeholder may wrap text that holds
        # an earlier one (e.g. <code> around <pre>), so the outer block has
        # to be put back before the inner placeholder can be found.
        for placeholder, source in reversed(marked):
            html = html.replace(placeholder, source)
        return html
if __name__ == '__main__':
    # Manual smoke test: run the stripper over a sample page that mixes
    # ordinary markup with protected <style>, <script> and <code> elements,
    # then print the result for visual inspection.
    html = '''
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="pl">
<head>
<title>To jest test</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-2" />
<meta name="language" content="pl" />
<meta name="robots" content="all" />
<style type="text/css" media="all">@import "/styles/glowna.css";</style>
<script type="text/javascript" src="/scripts/funkcje.js"> </script>
</head>
<body id>
<div id="accessibility">goto [<a href="#menu">part one</a>] [<a href="#search">part two</a>]</div>
<div id="container">
<script language="JavaScript">
<!--
document.write('<code> to jest test</code>')
//-->
</script> <code>
To jest
jakis kod
</code> <script>
<!--
document.write('to jest script2')
//-->
</script>
<table border=0>
<tr>
<td>
<A HREF="http://python.org">
<font color="red">Python HomePage</font>
</A>
</td>
</tr>
</table>
</div>
</body>
</html>
'''
    test = StripWhiteChars()
    # The parenthesised single-argument form prints the same on Python 2
    # and is also valid on Python 3.
    print(test.strip(html))
}}}
It can be added to !CherryPy as a filter:
{{{
#!python
from cherrypy import cpg
from cherrypy.lib.filter.basefilter import BaseOutputFilter
class StripWhiteCharsFilter(BaseOutputFilter):
    """Output filter that removes whitespace between the HTML tags of a page."""

    def __init__(self):
        """Set up the base filter and the stripper this filter delegates to."""
        super(StripWhiteCharsFilter, self).__init__()
        # Built once so its regexes are compiled a single time per filter.
        self.obj = StripWhiteChars()

    def beforeResponse(self):
        """Replace the response body with its whitespace-stripped version."""
        # The body is joined into a single string before stripping and is
        # stored back as a one-element list.
        page = ''.join(cpg.response.body)
        stripped = self.obj.strip(page)
        cpg.response.body = [stripped]
# Placeholder page served by the demo application below.
testhtml = '<html>.....</html>'


class Root(object):
    """Demo application root whose pages go through the stripping filter."""

    # NOTE(review): presumably CherryPy runs every filter in this list on
    # responses produced by this object -- confirm against the CherryPy docs.
    _cpFilterList = [StripWhiteCharsFilter()]

    def index(self, **kwargs):
        """Return the sample page; any keyword arguments are ignored."""
        return testhtml

    # Flag the handler as exposed (presumably what makes it reachable over
    # HTTP in CherryPy -- confirm).
    index.exposed = True


# Install the application root, then start the server with Root.conf.
cpg.root = Root()
cpg.server.start(configFile='Root.conf')
}}}