|
| 1 | + |
| 2 | +import os, re, codecs, subprocess |
| 3 | +import shutil, stat, errno, sys, operator |
| 4 | + |
| 5 | +import urllib.parse, html |
| 6 | +from lxml import etree |
| 7 | + |
| 8 | + |
# Root directory of the locally stored copy of the site. Every other path in
# this script is built by appending to it, so it must end with '/'.
ref_path = 'e:/tools/wget/cplusplus_reference/www.cplusplus.com/'
# False: only the top-level directories listed in ref_dirs are packaged and the
# reference-only TOC is used. True: every directory is packaged with the full TOC.
full_site = False

# Absolute URL prefix that clear_html() strips from links to make them local.
web_ref_prefix = 'http://www.cplusplus.com/reference/'
# Base name (without the '.xml' extension) of the generated Qt help project file.
qch_proj_name = 'qch-proj'

# Top-level directories under ref_path that are kept when full_site is False.
ref_dirs = [
    'img',
    'reference',
    'site',
    'v321'
]
| 21 | + |
# Skeleton of the Qt help project file. run() fills {0} with the table of
# contents, {1} with the <keyword> lines and {2} with the <file> lines, and
# strips the surrounding whitespace before writing the result.
QCH_PROJ_TEXT_TMPL = """
<?xml version='1.0' encoding='utf-8'?>
<QtHelpProject version="1.0">
 <namespace>cplusplus_com</namespace>
 <virtualFolder>cpp</virtualFolder>
 <customFilter name="C++ Reference">
 <filterAttribute>cplusplus_ref</filterAttribute>
 </customFilter>
 <filterSection>
 <filterAttribute>cplusplus_ref</filterAttribute>
 <toc>
 {0}
 </toc>
 <keywords>
 {1}
 </keywords>
 <files>
 {2}
 </files>
 </filterSection>
</QtHelpProject>
"""
| 44 | + |
# Table of contents used by run() when full_site is True: one root section
# with a child section per top-level area of the site.
MAIN_TOC_FULL = """
<section title="cplusplus.com" ref="index.html">
 <section title="Information" ref="info/index.html" />
 <section title="Tutorials" ref="doc/index.html" />
 <section title="Reference" ref="reference/index.html" />
 <section title="Articles" ref="articles/index.html" />
</section>
"""
| 53 | + |
# Table of contents used by run() when full_site is False: a single entry
# pointing at the reference index page.
MAIN_TOC_REF = """
<section title="Standard C++ Library reference" ref="reference/index.html" />
"""
| 57 | + |
| 58 | + |
| 59 | +# --- |
def print_keywords(keywords):
    """Write every keyword and the page it points to on standard output.

    keywords -- mapping of keyword text (str) to page address (str).

    Each entry is printed as the keyword, a line break, ' :: ' followed by
    the address, and an empty line.
    """
    for name, target in keywords.items():
        print(''.join((name, '\n :: ', target, '\n')))
| 63 | + |
| 64 | + |
| 65 | +# --- |
def _drop_element(node):
    """Detach *node* from its parent while keeping the text that follows it.

    In lxml the text after an element's closing tag is stored on the element
    itself (its ``tail``), so a plain ``parent.remove(node)`` would delete
    that text together with the element.
    """
    parent = node.getparent()
    tail = node.tail
    if tail:
        previous = node.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(node)


def clear_html(fp):
    """Clean one downloaded HTML page in place.

    fp -- path of the file to process; anything whose extension is not
          '.html' or '.htm' is ignored.

    The page is parsed with lxml and the following is done:
      * <script> elements whose text contains 'consent=cookie' are removed;
      * <div class="C_support"> and <ins> elements (ads) are removed;
      * links containing the absolute ``web_ref_prefix`` are rewritten as
        links relative to the current page.
    The file is rewritten (UTF-8, HTML serialisation) only if something
    changed. Returns None.
    """
    ext = os.path.splitext(fp)[1]
    if ext not in ('.html', '.htm'):
        return

    # Path of the page relative to the 'reference' root, the number of
    # folders between the page and that root, and the page's own folder.
    # NOTE(review): for pages outside 'reference/' the prefix is not found,
    # so the count is taken from the whole path - confirm this is acceptable.
    rel_fp = fp.replace(ref_path + 'reference/', '')
    updir_count = rel_fp.count('/')
    rel_fp_base = os.path.dirname(rel_fp) + '/'

    html_doc = etree.parse(fp, etree.HTMLParser())
    html_changed = False

    # remove cookie panel
    for script in html_doc.xpath('//script'):
        text = script.text
        if text and 'consent=cookie' in text:
            _drop_element(script)
            html_changed = True

    # remove ads
    for query in ('//div[@class="C_support"]', '//ins'):
        for node in html_doc.xpath(query):
            _drop_element(node)
            html_changed = True

    # fix relative links
    for link in html_doc.xpath('//a[contains(@href,"' + web_ref_prefix + '")]'):
        href = link.get("href")
        text = link.xpath('string()')

        print('Trying to fix external link:\n {0}\n {1} :: {2}'.format(fp, href, str(text)))

        href = href.replace(web_ref_prefix, '')

        if href.startswith(rel_fp_base):
            # Target lives under the current page's folder: drop only the
            # leading folder part. Replacing every occurrence would break
            # targets such as 'string/string/'.
            href = href[len(rel_fp_base):]
        else:
            # Otherwise climb up to the reference root first.
            href = '../' * updir_count + href

        link.set("href", href)
        html_changed = True

    if html_changed:
        html_doc.write(fp, encoding='utf-8', method='html')
| 124 | + |
| 125 | + |
| 126 | +# --- |
def fix_js():
    """Patch 'v321/main.js' under ``ref_path`` so the 'versions' link works in Qt help.

    The script adds the 'versions' link after the content is loaded, using
    the site-absolute path "/site/versions/". That literal is replaced with
    the absolute Qt help URL of the same page. The file is read and written
    as UTF-8. Returns None.
    """
    main_js = ref_path + 'v321/main.js'

    # newline='' disables newline translation so the file's line endings are
    # kept exactly as they are; 'with' closes the handle even on errors.
    with open(main_js, 'r', encoding='utf-8', newline='') as src:
        text = src.read()

    # set Qt absolute path instead of the default one
    patched = text.replace('"/site/versions/"', '"qthelp://cplusplus_com/cpp/site/versions/index.html"')

    if patched == text:
        # Nothing to replace (for example the file was already patched).
        return

    with open(main_js, 'w', encoding='utf-8', newline='') as dst:
        dst.write(patched)
| 140 | + |
| 141 | + |
| 142 | +# --- |
def _qualify_keyword(text, href):
    """Return *text* followed by ' (<folder>)', the folder two levels above *href*.

    The folder is taken from *href* with 'reference/' removed and is
    HTML-escaped, because keyword names are stored already escaped.
    """
    trimmed = href.replace('reference/', '')
    spec = os.path.dirname(os.path.dirname(trimmed))
    return text + ' (' + html.escape(spec) + ')'


def collect_keywords(fp, keywords, main_ref=False):
    """Collect navigation links of one page as help keywords.

    fp       -- path of the page; ignored unless it ends in '.html'/'.htm'
                and contains 'reference/'.
    keywords -- dict updated in place: HTML-escaped link text -> page address
                relative to ``ref_path``.
    main_ref -- True: take the links of the 'reference_box' block inside the
                'I_nav' block; False: take the links of the other id-bearing
                blocks inside 'I_nav' (except 'I_subnav').

    Links without an href, containing '..', equal to 'index.html' or starting
    with 'http' are skipped. When a keyword is already registered for a
    different page, both entries are renamed to 'keyword (folder)'; meeting
    the very same keyword/page pair again changes nothing. Returns None.
    """
    ext = os.path.splitext(fp)[1]
    if ext not in ('.html', '.htm') or 'reference/' not in fp:
        return

    html_doc = etree.parse(fp, etree.HTMLParser())

    links = []
    if main_ref:
        # only the Reference block links
        links = html_doc.xpath('//div[@id="I_nav"]//div[@id="reference_box"]//a')
    else:
        # submodules links (below Reference block)
        for div in html_doc.xpath('//div[@id="I_nav"]//div'):
            at_id = div.get('id')
            if at_id and at_id != 'reference_box' and at_id != 'I_subnav':
                # NOTE(review): each matching block replaces the previous
                # result, so only the last one contributes - confirm intended.
                links = div.xpath('.//a')

    # Folder of the current page relative to the site root.
    page_dir = os.path.dirname(fp).replace(ref_path, '')

    # get link names and addresses
    for link in links:
        href = link.get("href")
        if href is None or '..' in href or href == 'index.html' or href.startswith('http'):
            continue

        href = urllib.parse.unquote(href)
        text = html.escape(link.xpath("string()")).strip()
        rel_href = (page_dir + '/' + href).replace('\\', '/')

        if text not in keywords:
            keywords[text] = rel_href
            continue

        prev_href = keywords[text]
        if prev_href == rel_href:
            # Same keyword pointing at the same page: not a real collision.
            continue

        # add parent folder to distinguish keywords and remove the previous keyword
        del keywords[text]
        keywords[_qualify_keyword(text, prev_href)] = prev_href
        keywords[_qualify_keyword(text, rel_href)] = rel_href
| 195 | + |
| 196 | + |
| 197 | +# --- |
def delete_header_keywords(keywords):
    """Remove the section-header entries from *keywords* in place.

    keywords -- dict of keyword text -> page address.

    The four header captions listed below are dropped from the dict.
    A caption that is not present is simply skipped instead of raising
    KeyError. Returns None.
    """
    headers = (
        'C library:',
        'Containers:',
        'Input/Output:',
        'Other:',
    )

    for header in headers:
        keywords.pop(header, None)
| 208 | + |
| 209 | + |
| 210 | +# --- |
def run():
    """Convert the downloaded site into a Qt help project file.

    Steps:
      1. collect the keywords of the main reference page and drop the
         section-header entries;
      2. patch the site's main.js;
      3. walk ``ref_path``: register every file for the project, clean each
         HTML page and collect its keywords (when ``full_site`` is False only
         the top-level folders in ``ref_dirs`` are visited);
      4. fill ``QCH_PROJ_TEXT_TMPL`` and write it to
         ``ref_path + qch_proj_name + '.xml'`` as UTF-8.
    Progress messages are printed on standard output. Returns None.
    """
    print('Process started')
    print('-- Generating Full site' if full_site else '-- Generating Reference block')

    # -- Init
    out_qch_proj = ref_path + qch_proj_name + '.xml'

    keywords = {}
    files_ar = []

    subsection_indent = ' '

    # -- Reference block keywords/links
    main_page = ref_path + 'reference/index.html'
    collect_keywords(main_page, keywords, True)
    delete_header_keywords(keywords)

    fix_js()

    print('Processing files')

    # -- Process pages (modify content, collect keywords, collect <file>'s for Qt help project)
    for root, _dirs, files in os.walk(ref_path):
        root = root.replace('\\', '/')

        # filter dirs for not full site (only reference)
        if not full_site and root != ref_path:
            # First path component below ref_path. Plain string handling is
            # used because ref_path contains regex metacharacters ('.').
            top_dir = root.replace(ref_path, '').split('/', 1)[0]
            if top_dir not in ref_dirs:
                continue

        for f in files:
            fp = root + '/' + f
            rel_fp = fp.replace(ref_path, '')
            if root == ref_path:
                # root already ends with '/', so drop the doubled separator
                rel_fp = rel_fp[1:]

            files_ar.append(subsection_indent + '<file>' + html.escape(rel_fp) + '</file>')

            clear_html(fp)
            collect_keywords(fp, keywords)

    print('Building project tree')

    # -- Fill the project file text template
    main_toc = MAIN_TOC_FULL if full_site else MAIN_TOC_REF
    qtoc = main_toc.strip().replace('\n', '\n ')

    # Keyword names are already HTML-escaped by collect_keywords(); the page
    # address is not and may contain characters such as '<' or '&', so it is
    # escaped here to keep the XML well-formed.
    qkeywords = '\n'.join(
        subsection_indent + '<keyword name="{0}" id="{0}" ref="{1}"/>'.format(key, html.escape(ref))
        for key, ref in keywords.items()
    ).strip()

    # Leave out generated artefacts (compiled help and the project file itself).
    qfiles = '\n'.join(
        entry for entry in files_ar
        if '.qch' not in entry and qch_proj_name not in entry
    ).strip()

    project_text = QCH_PROJ_TEXT_TMPL.format(qtoc, qkeywords, qfiles).strip()

    print('Writing project file')

    # -- Write the project file
    # newline='' writes the line breaks exactly as they are in the text.
    with open(out_qch_proj, 'w', encoding='utf-8', newline='') as out:
        out.write(project_text)

    print('Process finished')
| 299 | + |
| 300 | + |
| 301 | +# ------------------- |
# Start the conversion; this call is unguarded, so it executes whenever the
# file is run or imported.
run()
0 commit comments