
Commit

update domain blacklist + add more stats
Chip Nguyen committed Feb 28, 2019
1 parent 8963870 commit 56e819f
Showing 9 changed files with 599 additions and 409 deletions.
36 changes: 20 additions & 16 deletions README.md
@@ -20,7 +20,7 @@ pip3 install -r requirements.txt

3. Install the library
``
-python3 setup.py install
+pip3 install .
``

If you want to uninstall the library, use:
@@ -43,9 +43,9 @@ There are about 23M URLs from between 2015-06 to 2018-10, of which around 40 - 6
It means that after you've downloaded and cleaned all good URLs from this, you should have approx 10M webpages or 50GB of pure text.

#### Gutenberg
-You can download the list of all URLs to US Gutenberg books [here](). There are 50K books, which convert to about 14GB of pure text.
+You can download the list of all URLs to US Gutenberg books [here](https://drive.google.com/file/d/1zIVaRaVqGP8VNBUT4eKAzW3gYWxNk728/view?usp=sharing). There are 50K books, which convert to about 14GB of pure text.

-You can also run ``lazynlp.get_us_gutenberg_links()`` to get the same list. For example, if you want to get all the Gutenberg URLs and store it in the file ``us_gutenberg.urls``:
+You can also run ``lazynlp.get_us_gutenberg_links()`` to get the same list. For example, if you want to get all the Gutenberg URLs and store it in the file ``us_gutenberg.urls``, run the following command. This might take half a day.

``
lazynlp.get_us_gutenberg_links('us_gutenberg.urls')
@@ -82,6 +82,14 @@ This function allows you to deduplicate a new file against all previously dedupl

### Step 3. Download the URLs

+If you want to download each webpage separately, call:

+``
+lazynlp.download_page(link, context=None, timeout=None)
+``
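For example, a single download might look like the sketch below. This is illustrative only: the URL is hypothetical, and the page text is assumed to come back as a string (or None on failure).

```
page = lazynlp.download_page('https://example.com/some-article.html', timeout=30)
if page:
    print(page[:200])
```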

+If you want to download from a file that contains a list of URLs, call:

``
lazynlp.download_pages(link_file, folder, timeout=30, default_skip=True, extensions=[], domains=[])
``
@@ -103,11 +111,11 @@ lazynlp.download_pages(link_file, folder, timeout=30, default_skip=True, extensi

default_skip:

-set to True if you want to automatically skip all URLs that contain domains and extensions that are known to be scraper-unfriendly.
+set to True if you want to automatically skip all URLs that contain domains and extensions that are known to be scraper-unfriendly or NSFW.

You can see the list of excluded domains at lazynlp/exclude_domains.txt.

You can see the list of excluded extensions at lazynlp/exclude_extensions.txt

You can also add your own domains and extensions to skip with the ``domains`` and ``extensions`` arguments.
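Putting these parameters together, a call might look like the sketch below. The file and folder names are hypothetical, and the extension and domain values are only examples of things you might skip.

```
lazynlp.download_pages('good_urls.txt', 'downloaded_pages',
                       timeout=30, default_skip=True,
                       extensions=['.jpg', '.zip'], domains=['example.com'])
```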

@@ -130,25 +138,21 @@ lazynlp.download_pages(link_file, folder, timeout=30, default_skip=True, extensi
If you have a lot of URLs, you can divide the list into multiple files and call this function separately. I was able to run 40 scripts in parallel.
I guess I could have parallelized the code. I just found this to be easier.

-If you want to download each webpage separately, call:

-``
-lazynlp.download_page(link, ctx=None, timeout=None)
-``

### Step 4. Clean the webpages

You can get rid of all HTML tags, decode utf-8 into string, transliterate foreign characters, collapse white space, replace unprintable characters, unescape HTML, etc. using methods available in lazynlp/cleaner.py.

-You can also just call
+You can also just call the following function to do most of the processing.

``
lazynlp.clean_page(page)
``

-to do most of it.
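For instance, a minimal sketch (the HTML string here is made up, and the exact output depends on which cleaning steps the function applies):

```
raw = '<html><body><p>Caf&eacute; &amp; bar, open&nbsp;daily.</p></body></html>'
text = lazynlp.clean_page(raw)
print(text)
```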

-Note:
-In this library, the function lazynlp.download_pages() does both the crawling and cleaning part, so the webpages you have are pure text, like this:

+#### Note:
+In this library, the function ``lazynlp.download_pages()`` does both the crawling and cleaning part, so the webpages you have are pure text, like this:

```
http://www.thecannabist.co/2017/03/02/jeff-sessions-russia-resign-democrats/74687/
@@ -209,7 +213,7 @@ Names of all the files that are deemed duplicated are stored in ``dupped_files.l

Names of all the files used for the dataset are stored in ``clean_files.list``

-Some statistics to keep in mind:
+## Some notes:
1. 1GB of text is about 1b characters. An English word has on average 4.5 characters, or 5.5 including whitespace.
So 1GB of text is about 181M words.

287 changes: 158 additions & 129 deletions lazynlp/analytics.py
@@ -1,5 +1,6 @@
import os
import random
import statistics
import time

from pybloom import BloomFilter
@@ -8,143 +9,171 @@
from lazynlp.utils import *

def build_ngram_from_tokens(tokens, n):
    """ Create a dictionary of n-gram from the list of tokens
    """
    count = {}
    curr = tokens[:n]
    count[' '.join(curr)] = 1
    for token in tokens[n:]:
        curr = curr[1:] + [token]
        string = ' '.join(curr)
        if not string in count:
            count[string] = 0
        count[string] += 1
    return count
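# Illustrative example (not part of the original file):
#   build_ngram_from_tokens(['a', 'b', 'c', 'a', 'b'], 2)
#   returns {'a b': 2, 'b c': 1, 'c a': 1}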

def build_ngram(file, outfile=None, bf=None, gran='word', n=10, uncase=True, alphanumeric=True, interval=100000):
    """
    gran: granularity of the token. It can be 'word' or 'char'
    bf: BloomFilter to update the existence of n-grams. Use when the file is too large to store a dictionary count
    alphanumeric: whether to keep only alphanumeric characters and space.
    outfile: if outfile is specified, build dictionary of n-grams and write it to outfile
    interval: how often to report the progress.
    """
    if not gran in set(['word', 'char']):
        raise ValueError("gran has to be 'word' or 'char'")
    count = {}
    f = open(file, 'r')
    i = 1
    line = f.readline()
    start = time.time()
    while line:
        line = line.strip()
        if line:
            if uncase:
                line = line.lower()
            if gran == 'word':
                if alphanumeric:
                    line = remove_non_alphanumeric(line)
            else:
                line = remove_non_alpha(line)
            line = collapse_white_spaces(line)
            tokens = line.split()
            line_count = build_ngram_from_tokens(tokens, n)

            if outfile:
                count.update()
            if not bf is None:
                for key in line_count:
                    bf.add(key)

            if interval > 0 and i % interval == 0:
                print('Process line: {}. Time: {}'.format(i, time.time() - start))
                start = time.time()

        i += 1
        line = f.readline()

    f.close()

    if outfile:
        outfold = outfile[:outfile.rfind('/')]
        os.makedirs(outfold, exist_ok=True)
        dict_sorted_2_file(count, os.path.join(outfile.format(n)))

    if bf:
        return bf

    return count
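# Illustrative usage with a Bloom filter, mirroring estimate_overlap below
# ('source.txt' is a hypothetical path; not part of the original file):
#   bf = BloomFilter(capacity=10000, error_rate=1e-5)
#   bf = build_ngram(file='source.txt', bf=bf, gran='word', n=8)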

def build_word_ngram(file, outfile, n=10, alphanumeric=True, norm=True, interval=100000):
    """ Build word ngrams and store in outfile
    n-grams in the format:
    [n-gram][tab][count]
    If alphanumeric, exclude all the words that contain non-alphanumeric characters
    """
    return build_ngram(file, outfile=outfile, n=n, gran='word', alphanumeric=alphanumeric, norm=norm, interval=interval)

def build_char_ngram(file, outfile, n=10, interval=100000):
    """
    Build character n-grams and store in outfile
    """
    return build_ngram(file, outfile=outfile, n=n, gran='char', interval=interval)

def estimate_overlap(source_files, target_files, gran='word', n=8, capacity=10000, error_rate=1e-5, header=0, interval=100000):
    """ Estimate overlapping of target_files with source_files using n-grams
    gran: granularity of the token. It can be 'word' or 'char'
    header: number of lines of each file to skip. It's because in our format, the first line is the url
    """
    if not gran in set(['word', 'char']):
        raise ValueError("gran has to be 'word' or 'char'")
    if isinstance(source_files, str):
        source_files = [source_files]
    if isinstance(target_files, str):
        target_files = [target_files]
    bf = BloomFilter(capacity=capacity, error_rate=error_rate)
    for source_file in source_files:
        bf = build_ngram(file=source_file, bf=bf, gran=gran, n=n, uncase=True, alphanumeric=True, interval=interval)

    results = []
    for file in target_files:
        print(file)
        results.append(estimate_overlap_bf(bf, file, gran=gran, n=8, header=header))
    return results
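# Illustrative usage (hypothetical file names; not part of the original file).
# header=1 skips the URL line at the top of each downloaded page:
#   ratios = estimate_overlap('pages/source.txt',
#                             ['pages/target_1.txt', 'pages/target_2.txt'],
#                             gran='word', n=8, header=1)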

def estimate_overlap_bf(bf, target_file, gran='word', n=8, header=0):
    """ Estimate overlapping of target_file with an existing bloomfilter
    gran: granularity of the token. It can be 'word' or 'char'
    """
    if not gran in set(['word', 'char']):
        raise ValueError("gran has to be 'word' or 'char'")

    f = open(target_file, 'r')
    for _ in range(header + 1):
        line = f.readline()

    total, seen = 0, 0
    while line:
        line = line.strip().lower()

        if gran == 'word':
            line = remove_non_alphanumeric(line)
        else:
            line = remove_non_alpha(line)
        line = collapse_white_spaces(line)
        tokens = line.split()
        line_count = build_ngram_from_tokens(tokens, n)

        for key in line_count:
            if key in bf:
                seen += 1
            total += 1

        line = f.readline()

    result = seen / total
    print('{} seen out of {}: {}'.format(seen, total, result))
    return result

def file_stats(file):
    """ Return statistics about line lengths and average character per words
    """
    line_lengths, token_lengths = [], []
    with open(file, 'r') as f:
        line = f.readline()
        while line:
            tokens = line.split()
            line_lengths.append(len(tokens))
            line_token_lengths = [len(token) for token in tokens]
            token_lengths.append([len(tokens), sum(line_token_lengths) / len(tokens)])
            line = f.readline()

    total_tokens = sum([pair[0] for pair in token_lengths])
    total_chars = sum([pair[0] * pair[1] for pair in token_lengths])
    average_chars = total_chars / total_tokens
    print("Character per word: average = {}.".format(average_chars))

    report = "Word count per line: average = {}, median = {}, max = {}, min = {}, stddev = {}."
    print(report.format(statistics.mean(line_lengths), statistics.median(line_lengths),
                        max(line_lengths), min(line_lengths),
                        statistics.stdev(line_lengths)))
    return statistics.mean(line_lengths), average_chars
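# Illustrative usage (hypothetical path; not part of the original file).
# Prints a summary and returns (average words per line, average characters per word):
#   avg_words, avg_chars = file_stats('pages/target_1.txt')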

def estimate_entropy(file, gran='word', max_n=10):
    pass

