
makeIdentifiers: Iterating through blocks in parallel #12

Closed · wants to merge 7 commits
9 changes: 8 additions & 1 deletion in rnlp/__main__.py

@@ -66,6 +66,9 @@

parser.add_argument('-b', '--blockSize', type=int, default=2,
help='Set the block size')
parser.add_argument('-n', '--n_jobs', type=int, default=1,
help='''Set the number of cores to use,
-1 to use all cores. Default=1''')
file_or_dir.add_argument('-d', '--directory', type=str,
help='Read text from all files in a directory.')
file_or_dir.add_argument('-f', '--file', type=str,
@@ -78,6 +81,10 @@
n = args.blockSize
logger.info('blockSize specified as ' + str(n))

# Set the number of jobs to perform in parallel.
n_jobs = args.n_jobs
logger.info('n_jobs: ' + str(n_jobs))

# Set the input file(s).
if args.file:
chosenFile = args.file
@@ -112,7 +119,7 @@

# Make identifiers from the blocks.
try:
        makeIdentifiers(blocks, n_jobs=n_jobs)
except Exception:
logger.error('Error while making identifiers.', exc_info=True)
exit(2)
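
For orientation, here is a minimal sketch (not part of the diff) of how the new flag travels from the command line into the program; the input filename is hypothetical:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--blockSize', type=int, default=2)
    parser.add_argument('-n', '--n_jobs', type=int, default=1)
    parser.add_argument('-f', '--file', type=str)

    # e.g.: python -m rnlp -f corpus.txt -b 2 -n -1
    args = parser.parse_args(['-f', 'corpus.txt', '-b', '2', '-n', '-1'])
    assert args.n_jobs == -1  # -1 asks joblib to use every available core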
229 changes: 117 additions & 112 deletions in rnlp/parse.py

@@ -30,7 +30,10 @@

# Non-standard Python Library
import nltk
from joblib import Parallel
from joblib import delayed
from tqdm import tqdm

from .textprocessing import getSentences
from .textprocessing import getBlocks

@@ -111,14 +114,18 @@ def _writeBk(target="sentenceContainsTarget(+SID,+WID).", treeDepth="3",
return


def makeIdentifiers(blocks, n_jobs=1,
                    target="sentenceContainsTarget(+SID,+WID).",
                    treeDepth="3", nodeSize="3", numOfClauses="8"):
"""
Make unique identifiers for components of the block and write to files.

:param blocks: Blocks of sentences (likely the output of
``textprocessing.getBlocks``).
:type blocks: list
:param n_jobs: Number of jobs to perform in parallel. -1 to use all
available cores. Default=1.
:type n_jobs: int.
:param target: Target to write to the background file (another option might
be ``blockContainsTarget(+BID,+SID).``).
:type target: str.
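
joblib interprets negative values relative to the CPU count: -1 uses all cores, -2 all but one. A small sketch of that convention (it mirrors joblib's documented formula and is not rnlp code):

    import multiprocessing

    def effective_workers(n_jobs):
        # joblib maps negative n_jobs to cpu_count() + 1 + n_jobs, clamped at 1.
        if n_jobs < 0:
            return max(multiprocessing.cpu_count() + 1 + n_jobs, 1)
        return n_jobs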
@@ -154,125 +161,123 @@ def makeIdentifiers(blocks, n_jobs=1,
# 100%|██████████████████████| 2/2 [00:00<00:00, 18.49it/s]
"""

    print("Creating background file...")

    _writeBk(target=target, treeDepth=treeDepth,
             nodeSize=nodeSize, numOfClauses=numOfClauses)

    print("Creating identifiers from the blocks...")

    # Build each block's facts in parallel, one job per block.
    fact_blocks = Parallel(n_jobs=n_jobs)(
        delayed(__makeIdentifiers)(blocks[i], i)
        for i in tqdm(range(len(blocks))))

    # Write the collected facts once, in block order.
    with open('facts.txt', 'w') as f:
        for block in fact_blocks:
            for line in block:
                f.write(line + '\n')
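
The dispatch above follows joblib's standard pattern: Parallel(n_jobs=...) consumes a generator of delayed(func)(args) calls and returns one result per call, in submission order, so blocks keep their order in facts.txt. A self-contained sketch of the same pattern, with square() as a stand-in for __makeIdentifiers:

    from joblib import Parallel, delayed

    def square(x):
        return x * x

    # Two worker processes; results come back in submission order.
    results = Parallel(n_jobs=2)(delayed(square)(i) for i in range(5))
    assert results == [0, 1, 4, 9, 16]

Collecting each block's facts in the workers and writing facts.txt once in the parent process also avoids several processes appending to the same file concurrently.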

def __makeIdentifiers(block, blockID):
    """
    Spreads out the process of making identifiers over a set number of cores.

    :param block: A single block of sentences.
    :type block: list
    :param blockID: Index identifying the block.
    :type blockID: int.

    :returns: A list of facts to be written to files.
    :rtype: list
    """

    # Initialize an empty list where the facts will be stored.
    facts = []

    # Sentence numbering within a block is 1-indexed.
    sentenceID = 1
    nSentences = len(block)
    beginning = nSentences/float(3)
    ending = (2 * nSentences)/float(3)

    for sentence in block:

        if sentenceID < nSentences:
            # mode: nextSentenceInBlock(blockID, sentenceID, sentenceID).
            ps = "nextSentenceInBlock(" + str(blockID) + "," + \
                 str(blockID) + "_" + str(sentenceID) + "," + \
                 str(blockID) + "_" + str(sentenceID+1) + ")."
            facts.append(ps)

        if sentenceID < beginning:
            # mode: earlySentenceInBlock(blockID, sentenceID).
            ps = "earlySentenceInBlock(" + str(blockID) + "," + \
                 str(blockID) + "_" + str(sentenceID) + ")."
            facts.append(ps)
        elif sentenceID > ending:
            # mode: lateSentenceInBlock(blockID, sentenceID).
            ps = "lateSentenceInBlock(" + str(blockID) + "," + \
                 str(blockID) + "_" + str(sentenceID) + ")."
            facts.append(ps)
        else:
            # mode: midWaySentenceInBlock(blockID, sentenceID).
            ps = "midWaySentenceInBlock(" + str(blockID) + "," + \
                 str(blockID) + "_" + str(sentenceID) + ")."
            facts.append(ps)

        # mode: sentenceInBlock(sentenceID, blockID).
        ps = "sentenceInBlock(" + str(blockID) + "_" + str(sentenceID) + \
             "," + str(blockID) + ")."
        facts.append(ps)

        wordID = 1
        tokens = nltk.word_tokenize(sentence)
        nWords = len(tokens)
        wBeginning = nWords/float(3)
        wEnding = (2*nWords)/float(3)

        for word in tokens:

            # mode: wordString(wordID, #str).
            ps = "wordString(" + str(blockID) + "_" + str(sentenceID) + \
                 "_" + str(wordID) + "," + "'" + str(word) + "')."
            facts.append(ps)

            # mode: partOfSpeechTag(wordID, #POS).
            POS = nltk.pos_tag([word])[0][1]
            ps = "partOfSpeech(" + str(blockID) + "_" + str(sentenceID) + \
                 "_" + str(wordID) + "," + '"' + str(POS) + '").'
            facts.append(ps)

            # mode: nextWordInSentence(sentenceID, wordID, wordID).
            if wordID < nWords:
                ps = "nextWordInSentence(" + str(blockID) + "_" + \
                     str(sentenceID) + "," + str(blockID) + "_" + \
                     str(sentenceID) + "_" + str(wordID) + "," + \
                     str(blockID) + "_" + str(sentenceID) + "_" + \
                     str(wordID+1) + ")."
                facts.append(ps)

            if wordID < wBeginning:
                # mode: earlyWordInSentence(sentenceID, wordID).
                ps = "earlyWordInSentence(" + str(blockID) + "_" + \
                     str(sentenceID) + "," + str(blockID) + "_" + \
                     str(sentenceID) + "_" + str(wordID) + ")."
                facts.append(ps)
            elif wordID > wEnding:
                # mode: lateWordInSentence(sentenceID, wordID).
                ps = "lateWordInSentence(" + str(blockID) + "_" + \
                     str(sentenceID) + "," + str(blockID) + "_" + \
                     str(sentenceID) + "_" + str(wordID) + ")."
                facts.append(ps)
            else:
                # mode: midWayWordInSentence(sentenceID, wordID).
                ps = "midWayWordInSentence(" + str(blockID) + "_" + \
                     str(sentenceID) + "," + str(blockID) + "_" + \
                     str(sentenceID) + "_" + str(wordID) + ")."
                facts.append(ps)

            # mode: wordInSentence(wordID, sentenceID).
            ps = "wordInSentence(" + str(blockID) + "_" + \
                 str(sentenceID) + "_" + str(wordID) + "," + \
                 str(blockID) + "_" + str(sentenceID) + ")."
            facts.append(ps)

            wordID += 1
        sentenceID += 1

    return facts
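
As a quick sanity check, a hypothetical call on a two-sentence block (assuming the required NLTK tokenizer and tagger models are downloaded) begins with these sentence-level facts:

    # Hypothetical usage; the word-level facts that follow depend on
    # NLTK's tokenizer and tagger output.
    facts = __makeIdentifiers(["The dog ran.", "It was fast."], 1)
    # facts[0] == 'nextSentenceInBlock(1,1_1,1_2).'
    # facts[1] == 'midWaySentenceInBlock(1,1_1).'
    # facts[2] == 'sentenceInBlock(1_1,1).'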
1 change: 1 addition & 0 deletions in rnlp/tests/requirements.txt

@@ -4,3 +4,4 @@ pytest-cov
unittest2
nltk
tqdm
joblib