gpambrozio · fxthomas · Mar 5, 2012 · Mar 14, 2012
diff --git a/PythonDocs2DocSet/create_docset.py b/PythonDocs2DocSet/create_docset.py
@@ -90,6 +90,7 @@ def find_existing_file(possible):
 modindex_path = find_existing_file([
     "modindex.html",
     "py-modindex.html",
+    "np-modindex.html"
 ])
 
 genindex_path = find_existing_file([
@@ -170,24 +171,21 @@ def find_existing_file(possible):
             apple_ref = "//apple_ref/cpp/cat/%s" % name
             pages[href].append(apple_ref)
 
-
-## Collect pages from the general index
-with codecs.open(os.path.join(source_folder, genindex_path), 'r', encoding="utf-8") as f:
-    for line in f:
-        for search in re.finditer("(<dt>|, )<a href=\"([^#]+).*?\">", line):
-            href = search.group(2)
-            if not href in pages:
-                pages[href] = []
-
-
-## Collect pages from the library index
-if os.path.exists(os.path.join(source_folder, "library/index.html")):
-    with codecs.open(os.path.join(source_folder, "library/index.html"), 'r', encoding="utf-8") as f:
-        for line in f:
-            for search in re.finditer("<a class=\"reference external\" href=\"([^#\"]+).*?\">", line):
-                href = "library/" + search.group(1)
-                if not ("http://" in href or "https://" in href or href in pages):
-                    pages[href] = []
+## Collect remaining HTML pages
+for path,_,files in os.walk ("."):
+  # Clean up path (remove "./")
+  cleanPath = re.sub (r"^./", "", path)
+  if cleanPath == ".":
+    cleanPath = ""
+  if os.path.samefile (path, dest_folder):
+    continue
+
+  # Walk through HTML files
+  for f in files:
+    if re.match (r".*\.html$", f):
+      href = os.path.join (cleanPath, f)
+      if not href in pages:
+        pages[href] = []
 
 with codecs.open(token_path, "w", encoding="utf-8" ) as tokens:
     ## Start of the tokens file
@@ -222,41 +220,45 @@ def find_existing_file(possible):
 
         ## This adds some hidden tags that makes Dash display this page's
         ## TOC on the left side of the screen, just like with iOS and OSX docs
-        toc = soup.find('div', 'sphinxsidebarwrapper').findAll("a", "reference")
-        if len(toc) > 0:
-            toc_tag = soup.new_tag("div", style="display:none;")
-            soup.body.append(toc_tag)
-            a_tag = soup.new_tag("a")
-            a_tag["name"] = "#"
-            toc_tag.append(a_tag)
-            h3_tag = soup.new_tag("h3")
-            h3_tag["class"] = "tasks"
-            h3_tag.append("TOC")
-            toc_tag.append(h3_tag)
-            ul_tag = soup.new_tag("ul")
-            ul_tag["class"] = "tooltip"
-            toc_tag.append(ul_tag)
-
-            for t in toc:
-                li_tag = soup.new_tag("li")
-                li_tag["class"] = "tooltip"
-                ul_tag.append(li_tag)
-                a_tag = soup.new_tag("a")
-                a_tag["href"] = t['href']
-                a_tag.append(t.text)
-                li_tag.append(a_tag)
-
-        if len(names) > 0:
-            tokens.write("<File path=\"%s\">\n" % href)
-            for name in names:
-                tokens.write("\t<Token><TokenIdentifier>%s</TokenIdentifier><Anchor>%s</Anchor></Token>\n" % (name, name))
-            tokens.write("</File>\n")
-
-            newFilePath = os.path.join(dest_folder, href)
-            if not os.path.exists(os.path.dirname(newFilePath)):
-                os.makedirs(os.path.dirname(newFilePath)) # might be a bug...if given something/test.html, it creates test.html as a directory!
-            with codecs.open(newFilePath, "w", encoding="utf-8") as newFile:
-                newFile.write(unicode(soup))
+        tocdiv = soup.find('div', 'sphinxsidebarwrapper')
+        if tocdiv is None:
+          tocdiv = soup.find('div', 'sphinxsidebar')
+
+        if tocdiv is not None:
+          toc = tocdiv.findAll("a", "reference")
+          if len(toc) > 0:
+              toc_tag = soup.new_tag("div", style="display:none;")
+              soup.body.append(toc_tag)
+              a_tag = soup.new_tag("a")
+              a_tag["name"] = "#"
+              toc_tag.append(a_tag)
+              h3_tag = soup.new_tag("h3")
+              h3_tag["class"] = "tasks"
+              h3_tag.append("TOC")
+              toc_tag.append(h3_tag)
+              ul_tag = soup.new_tag("ul")
+              ul_tag["class"] = "tooltip"
+              toc_tag.append(ul_tag)
+
+              for t in toc:
+                  li_tag = soup.new_tag("li")
+                  li_tag["class"] = "tooltip"
+                  ul_tag.append(li_tag)
+                  a_tag = soup.new_tag("a")
+                  a_tag["href"] = t['href']
+                  a_tag.append(t.text)
+                  li_tag.append(a_tag)
+
+        tokens.write("<File path=\"%s\">\n" % href)
+        for name in names:
+            tokens.write("\t<Token><TokenIdentifier>%s</TokenIdentifier><Anchor>%s</Anchor></Token>\n" % (name, name))
+        tokens.write("</File>\n")
+
+        newFilePath = os.path.join(dest_folder, href)
+        if not os.path.exists(os.path.dirname(newFilePath)):
+            os.makedirs(os.path.dirname(newFilePath)) # might be a bug...if given something/test.html, it creates test.html as a directory!
+        with codecs.open(newFilePath, "w", encoding="utf-8") as newFile:
+            newFile.write(unicode(soup))
 
     tokens.write("</Tokens>")
 
@@ -274,3 +276,6 @@ def find_existing_file(possible):
 os.remove(os.path.join(docset_folder, "Contents/Resources/Tokens.xml"))
 
 print("done")
+print("")
+print("You might have to manually add missing references (images, ...) as they are not automatically detected.")
+print("It is also a good practice to remove additional elements, such as headers, sidebars, and so on.")