remove dead code
lucasg committed Aug 16, 2017
1 parent cad579c commit 81febaf
Showing 1 changed file with 0 additions and 227 deletions.
posh-to-dash.py: 0 additions and 227 deletions
@@ -126,233 +126,6 @@ def download_textfile(url : str , output_filename : str, params : dict = None):
r = requests.get(url, data = params)
with open(output_filename, 'w', encoding="utf8") as f:
f.write(r.text)


def download_as_browser(url, output_filename):
global global_driver
#driver = webdriver.PhantomJS(executable_path="C:\\Users\\lucas\\AppData\\Roaming\\npm\\node_modules\\phantomjs-prebuilt\\lib\\phantom\\bin\phantomjs.exe")
global_driver.get(url)

#soupFromJokesCC = BeautifulSoup(driver.page_source) #page_source fetches page after rendering is complete
with open(output_filename, 'w', encoding="utf8") as f:
f.write(global_driver.page_source)

#global_driver.save_screenshot(output_filename+'screen.png') # save a screenshot to disk



def download_and_fix_links(url, output_filepath, posh_version = Configuration.posh_version, is_index = False, documents_folder = None):
""" Download and fix broken nav paths for modules index """
global global_driver
# r = requests.get(url)
# index_html = r.text
#driver = webdriver.PhantomJS(executable_path="C:\\Users\\lucas\\AppData\\Roaming\\npm\\node_modules\\phantomjs-prebuilt\\lib\\phantom\\bin\phantomjs.exe")
try:
global_driver.get(url)
index_html = global_driver.page_source
except (ConnectionResetError, urllib.error.URLError) as e:
# we may have triggered an anti-scraping time ban
# Lay low for several seconds and get back to it.

global_driver.quit()
global_driver = webdriver.PhantomJS(executable_path="C:\\Users\\lucas\\AppData\\Roaming\\npm\\node_modules\\phantomjs-prebuilt\\lib\\phantom\\bin\phantomjs.exe")
time.sleep(2)
index_html = None

# try a second time, and let the error propagate if it fails again
if not index_html:
global_driver.get(url)
index_html = global_driver.page_source


soup = bs(index_html, 'html.parser')


links = soup.findAll("a", { "data-linktype" : "relative-path"}) # for modules and cmdlet pages
if is_index: # for index page
content_table = soup.findAll("table", { "class" : "api-search-results standalone"})[0]
links = content_table.findAll(lambda tag: tag.name == 'a' and 'ms.title' in tag.attrs) # for index page

for link in links:
# search replace <a href="(\w+-\w+)\?view=powershell-6" data-linktype="relative-path">
# <a href="$1.html" data-linktype="relative-path">
if is_index:
link_str_pattern = "([\w\.\/]+)\?view=powershell-"
else:
link_str_pattern = "(\w+-\w+)\?view=powershell-"

link_pattern = re.compile(link_str_pattern)
targets = link_pattern.findall(link['href'])
if not len(targets): # badly formatted 'a' link
continue

if is_index:
uri_path = targets[0].lstrip('/').rstrip('/')
fixed_link = soup.new_tag("a", href="%s/index.html" % (uri_path), **{ "ms.title" : link["ms.title"]})
else:
fixed_link = soup.new_tag("a", href="%s.html" % targets[0], **{ "data-linktype" : "relative-path"})

print(link['href'], " -> ", fixed_link['href'])
fixed_link.string = link.string
link.replaceWith(fixed_link)

# remove unsupported nav elements
nav_elements = [
["nav", { "class" : "doc-outline", "role" : "navigation"}],
["ul", { "class" : "breadcrumbs", "role" : "navigation"}],
["div", { "class" : "sidebar", "role" : "navigation"}],
["div", { "class" : "dropdown dropdown-full mobilenavi"}],
["p", { "class" : "api-browser-description"}],
["div", { "class" : "api-browser-search-field-container"}],
["div", { "class" : "pageActions"}],
["div", { "class" : "dropdown-container"}],
]

for nav in nav_elements:
nav_class, nav_attr = nav

for nav_tag in soup.findAll(nav_class, nav_attr):
_ = nav_tag.extract()

# Fix themes uri paths
soup = crawl_posh_themes(documents_folder, soup, output_filepath)

# Export fixed html
with open(output_filepath, 'wb') as o_index:

fixed_html = soup.prettify("utf-8")
o_index.write(fixed_html)

#global_driver.save_screenshot(output_filepath+'screen.png') # save a screenshot to disk
return index_html


def crawl_posh_themes(documents_folder, soup, current_filepath):

theme_output_dir = os.path.join(documents_folder, domain)

# downloading stylesheets
for link in soup.findAll("link", { "rel" : "stylesheet"}):
uri_path = link['href'].strip()

if uri_path.lstrip('/').startswith(default_theme_uri):

css_url = "https://%s/%s" % (domain, uri_path)
css_filepath = os.path.join(theme_output_dir, uri_path.lstrip('/'))

os.makedirs(os.path.dirname(css_filepath), exist_ok = True)

# do not download the same file twice
if not os.path.exists(css_filepath):
download_textfile(css_url, css_filepath)

# fix source map css
# $hex_encoded_id.$name.css -> $name.css
css_filename = os.path.basename(uri_path)
css_dirname = os.path.dirname(css_filepath)

r = re.compile("\w+\.([\w\.]+)")
sourcemap_css_filename = r.match(css_filename).groups()[0]
download_textfile(css_url, os.path.join(css_dirname, sourcemap_css_filename))

# Converting to a relative link
path = os.path.relpath(css_filepath, os.path.dirname(current_filepath))
rel_uri = '/'.join(path.split(os.sep))
link['href'] = rel_uri

# downloading scripts
for script in soup.findAll("script", {"src":True}):
uri_path = script['src']

if uri_path.lstrip('/').startswith(default_theme_uri):

script_url = "https://%s/%s" % (domain, uri_path)

# path normalization: strip the '?v=' cache-busting query string (could be improved)
script_path = uri_path.lstrip('/')
if -1 != script_path.find('?v='):
script_path = script_path[0:script_path.find('?v=')]

script_filepath = os.path.join(theme_output_dir, script_path)
os.makedirs(os.path.dirname(script_filepath), exist_ok = True)

# do not download the same file twice
if not os.path.exists(script_filepath):
download_textfile(script_url, script_filepath)

# Converting to a relative link
path = os.path.relpath(script_filepath, current_filepath)
rel_uri = '/'.join(path.split(os.sep))
script['src'] = rel_uri

return soup


def crawl_posh_documentation(documents_folder, powershell_version = Configuration.posh_version):
""" Crawl and download Posh modules documentation """

index = default_url % powershell_version
modules_toc = default_toc % (powershell_version, powershell_version)

index_filepath = os.path.join(documents_folder, domain, "en-us", "index.html")
download_and_fix_links(index, index_filepath, is_index= True, posh_version = powershell_version, documents_folder = documents_folder)

modules_filepath = os.path.join(documents_folder, "modules.toc")
download_textfile(modules_toc, modules_filepath)

theme_output_dir = os.path.join(documents_folder, domain, default_theme_uri)
os.makedirs(theme_output_dir, exist_ok = True)

with open(modules_filepath, 'r') as modules_fd:
modules = json.load(modules_fd)

for module in modules['items'][0]['children']:

module_url = urllib.parse.urljoin(modules_toc, module["href"])
module_url = "%s/?view=powershell-%s" % (module_url, powershell_version)

module_dir = os.path.join(documents_folder, base_url, module['toc_title'])
module_filepath = os.path.join(module_dir, "index.html")
os.makedirs(module_dir, exist_ok = True)

logging.debug("downloading modules doc %s -> %s" %(module_url, module_filepath))
mod_html = download_and_fix_links(module_url, module_filepath, posh_version = powershell_version, documents_folder = documents_folder)


for cmdlet in module['children']:
cmdlet_name = cmdlet['toc_title']

if cmdlet_name.lower() in ("about", "functions", "providers", "provider"): # skip special toc
continue

logging.debug("cmdlet %s" % cmdlet)

cmdlet_urlpath = cmdlet["href"]
cmdlet_url = urllib.parse.urljoin(modules_toc, cmdlet_urlpath)
cmdlet_url = "%s?view=powershell-%s" % (cmdlet_url, powershell_version)

cmdlet_filepath = os.path.join(module_dir, "%s.html" % cmdlet_name)

logging.debug("downloading cmdlet doc %s -> %s" %(cmdlet_url, cmdlet_filepath))
cmdlet_html = download_and_fix_links(cmdlet_url, cmdlet_filepath, posh_version = powershell_version, documents_folder = documents_folder)


def insert_into_sqlite_db(cursor, name, record_type, path):
""" Insert a new unique record in the sqlite database. """
try:
cursor.execute('SELECT rowid FROM searchIndex WHERE path = ?', (path,))
dbpath = cursor.fetchone()
cursor.execute('SELECT rowid FROM searchIndex WHERE name = ?', (name,))
dbname = cursor.fetchone()

if dbpath is None and dbname is None:
cursor.execute('INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES (?,?,?)', (name, record_type, path))
logging.debug('DB add [%s] >> name: %s, path: %s' % (record_type, name, path))
else:
logging.debug('record exists')

except:
pass


def make_docset(source_dir, dst_dir, filename):
