-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathTextUtil.py
executable file
·145 lines (122 loc) · 4.34 KB
/
TextUtil.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python3
# Copyright (c) 2008-11 Qtrac Ltd. All rights reserved.
# This program or module is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version. It is provided for educational
# purposes and is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
"""
This module provides a few string manipulation functions.

>>> is_balanced("(Python (is (not (lisp))))")
True
>>> shorten("The Crossing", 10)
'The Cro...'
>>> simplify(" some  text    with   spurious   whitespace ")
'some text with spurious whitespace'
"""
import string
def is_balanced(text, brackets="()[]{}<>"):
    """Reports whether every bracket in text has a matching partner

    brackets is read two characters at a time: the first of each pair
    is the opening bracket and the second is its closing bracket. The
    two characters of a pair must differ (this is checked with an
    assert). A trailing unpaired character in brackets is ignored.

    Only a running count per bracket kind is kept, so a closing
    bracket that appears before its opener makes the text unbalanced,
    but the relative nesting of different kinds is not checked.

    >>> is_balanced("no brackets at all")
    True
    >>> is_balanced("<b>bold</b>")
    True
    >>> is_balanced("[<b>(some {thing}) goes</b>]")
    True
    >>> is_balanced("<b>[not (where {it}) is}]</b>")
    False
    >>> is_balanced("(not (<tag>(like) (anything)</tag>)")
    False
    """
    open_counts = {}    # opening bracket -> how many are still unclosed
    opener_for = {}     # closing bracket -> its opening bracket
    for opener, closer in zip(brackets[::2], brackets[1::2]):
        assert opener != closer, "the bracket characters must differ"
        open_counts[opener] = 0
        opener_for[closer] = opener

    for char in text:
        if char in open_counts:
            open_counts[char] += 1
            continue
        if char not in opener_for:
            continue    # not a bracket character at all
        opener = opener_for[char]
        if not open_counts[opener]:
            # A closer with nothing of its kind left open.
            return False
        open_counts[opener] -= 1

    # Balanced only when nothing is left open.
    return all(count == 0 for count in open_counts.values())
def shorten(text, length=25, indicator="..."):
    """Returns text or a truncated copy with the indicator added

    text is any string; length is the maximum length of the returned
    string (including any indicator); indicator is the string added at
    the end to indicate that the text has been shortened.

    If text already fits within length it is returned unchanged.
    If length is too small to hold the whole indicator, the indicator
    itself is cut down to length characters, so the result is never
    longer than length (a negative length is treated as 0).

    >>> shorten("Second Variety")
    'Second Variety'
    >>> shorten("Voices from the Street", 17)
    'Voices from th...'
    >>> shorten("Radio Free Albemuth", 10, "*")
    'Radio Fre*'
    >>> shorten("Radio Free Albemuth", 2)
    '..'
    """
    if len(text) <= length:
        return text
    length = max(length, 0)
    if len(indicator) >= length:
        # No room for any of the text. Slicing text with the negative
        # end index (length - len(indicator)) would keep most of the
        # text and overshoot length, so return what fits of the indicator.
        return indicator[:length]
    return text[:length - len(indicator)] + indicator
def simplify(text, whitespace=string.whitespace, delete=""):
    r"""Returns the text with each run of whitespace collapsed to one space

    Leading and trailing whitespace is dropped altogether.

    The whitespace parameter is a string of characters, each of which
    is considered to be a space.

    If delete is not empty it should be a string, in which case any
    characters in the delete string are excluded from the resultant
    string. Deletion is applied before the whitespace test.

    >>> simplify(" this    and\n that\t too")
    'this and that too'
    >>> simplify("  Washington   D.C.\n")
    'Washington D.C.'
    >>> simplify("  Washington   D.C.\n", delete=",;:.")
    'Washington DC'
    >>> simplify(" disemvoweled ", delete="aeiou")
    'dsmvwld'
    """
    words = []
    current = []    # characters of the word being collected
    for ch in text:
        if ch in delete:
            continue
        if ch not in whitespace:
            current.append(ch)
        elif current:
            # Whitespace ends the word in progress; repeated
            # whitespace finds nothing pending and is skipped.
            words.append("".join(current))
            current = []
    if current:
        words.append("".join(current))
    return " ".join(words)
def insert_at(string, position, insert):
    """Returns a copy of string with insert spliced in at position

    position follows slice semantics: negative values count from the
    end, and values beyond either end are clamped to that end.

    >>> string = "ABCDE"
    >>> result = []
    >>> for i in range(-2, len(string) + 2):
    ...     result.append(insert_at(string, i, "-"))
    >>> result[:5]
    ['ABC-DE', 'ABCD-E', '-ABCDE', 'A-BCDE', 'AB-CDE']
    >>> result[5:]
    ['ABC-DE', 'ABCD-E', 'ABCDE-', 'ABCDE-']
    """
    head = string[:position]
    tail = string[position:]
    return head + insert + tail
def dummy_insert_at(string, position, insert):
    """Returns string exactly as given; nothing is inserted

    This has the same signature as insert_at(), but the position and
    insert arguments are ignored and the string argument itself is
    returned.

    NOTE(review): this looks like a deliberate do-nothing stand-in for
    insert_at() -- confirm before relying on it or removing it.

    >>> dummy_insert_at("ABCDE", 2, "-")
    'ABCDE'
    """
    return string
if __name__ == "__main__":
    # Executed only when this file is run as a script (not on import):
    # run the doctest examples embedded in this module's docstrings.
    import doctest
    doctest.testmod()