
makeIdentifiers: Iterating through blocks in parallel #12

Closed · wants to merge 7 commits
9 changes: 8 additions & 1 deletion in rnlp/__main__.py

@@ -66,6 +66,9 @@

parser.add_argument('-b', '--blockSize', type=int, default=2,
help='Set the block size')
parser.add_argument('-n', '--n_jobs', type=int, default=1,
help='''Set the number of cores to use,
-1 to use all cores. Default=1''')
file_or_dir.add_argument('-d', '--directory', type=str,
help='Read text from all files in a directory.')
file_or_dir.add_argument('-f', '--file', type=str,
@@ -78,6 +81,10 @@
n = args.blockSize
logger.info('blockSize specified as ' + str(n))

# Set the number of jobs to perform in parallel.
n_jobs = args.n_jobs
logger.info('n_jobs: ' + str(n_jobs))

# Set the input file(s).
if args.file:
chosenFile = args.file
@@ -112,7 +119,7 @@

# Make identifiers from the blocks.
try:
        makeIdentifiers(blocks, n_jobs=n_jobs)
except Exception:
logger.error('Error while making identifiers.', exc_info=True)
exit(2)
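
For orientation, here is a minimal sketch (not part of the diff) of how the new flag travels from the command line into the program; the input filename is hypothetical:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--blockSize', type=int, default=2)
    parser.add_argument('-n', '--n_jobs', type=int, default=1)
    parser.add_argument('-f', '--file', type=str)

    # e.g.: python -m rnlp -f corpus.txt -b 2 -n -1
    args = parser.parse_args(['-f', 'corpus.txt', '-b', '2', '-n', '-1'])
    assert args.n_jobs == -1  # -1 asks joblib to use every available core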
229 changes: 117 additions & 112 deletions in rnlp/parse.py

@@ -30,7 +30,10 @@

# Non-standard Python Library
import nltk
from joblib import Parallel
from joblib import delayed
from tqdm import tqdm

from .textprocessing import getSentences
from .textprocessing import getBlocks

@@ -111,14 +114,18 @@ def _writeBk(target="sentenceContainsTarget(+SID,+WID).", treeDepth="3",
return


def makeIdentifiers(blocks, n_jobs=1,
                    target="sentenceContainsTarget(+SID,+WID).",
                    treeDepth="3", nodeSize="3", numOfClauses="8"):
"""
Make unique identifiers for components of the block and write to files.

:param blocks: Blocks of sentences (likely the output of
``textprocessing.getBlocks``).
:type blocks: list
:param n_jobs: Number of jobs to perform in parallel. -1 to use all
available cores. Default=1.
:type n_jobs: int.
:param target: Target to write to the background file (another option might
be ``blockContainsTarget(+BID,+SID).``).
:type target: str.
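
joblib interprets negative values relative to the CPU count: -1 uses all cores, -2 all but one. A small sketch of that convention (it mirrors joblib's documented formula and is not rnlp code):

    import multiprocessing

    def effective_workers(n_jobs):
        # joblib maps negative n_jobs to cpu_count() + 1 + n_jobs, clamped at 1.
        if n_jobs < 0:
            return max(multiprocessing.cpu_count() + 1 + n_jobs, 1)
        return n_jobs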
@@ -154,125 +161,123 @@ def makeIdentifiers(blocks, n_jobs=1,
# 100%|██████████████████████| 2/2 [00:00<00:00, 18.49it/s]
"""

    print("Creating background file...")

    _writeBk(target=target, treeDepth=treeDepth,
             nodeSize=nodeSize, numOfClauses=numOfClauses)

    print("Creating identifiers from the blocks...")

    # Build each block's facts in parallel, one job per block.
    fact_blocks = Parallel(n_jobs=n_jobs)(
        delayed(__makeIdentifiers)(blocks[i], i)
        for i in tqdm(range(len(blocks))))

    # Write the collected facts once, in block order.
    with open('facts.txt', 'w') as f:
        for block in fact_blocks:
            for line in block:
                f.write(line + '\n')
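
The dispatch above follows joblib's standard pattern: Parallel(n_jobs=...) consumes a generator of delayed(func)(args) calls and returns one result per call, in submission order, so blocks keep their order in facts.txt. A self-contained sketch of the same pattern, with square() as a stand-in for __makeIdentifiers:

    from joblib import Parallel, delayed

    def square(x):
        return x * x

    # Two worker processes; results come back in submission order.
    results = Parallel(n_jobs=2)(delayed(square)(i) for i in range(5))
    assert results == [0, 1, 4, 9, 16]

Collecting each block's facts in the workers and writing facts.txt once in the parent process also avoids several processes appending to the same file concurrently.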

def __makeIdentifiers(block, blockID):
    """
    Spreads out the process of making identifiers over a set number of cores.

    :param block: A single block of sentences.
    :type block: list
    :param blockID: Index identifying the block.
    :type blockID: int.

    :returns: A list of facts to be written to files.
    :rtype: list
    """

    # Initialize an empty list where the facts will be stored.
    facts = []

    # Sentence numbering within a block is 1-indexed.
    sentenceID = 1
    nSentences = len(block)
    beginning = nSentences/float(3)
    ending = (2 * nSentences)/float(3)

    for sentence in block:

        if sentenceID < nSentences:
            # mode: nextSentenceInBlock(blockID, sentenceID, sentenceID).
            ps = "nextSentenceInBlock(" + str(blockID) + "," + \
                 str(blockID) + "_" + str(sentenceID) + "," + \
                 str(blockID) + "_" + str(sentenceID+1) + ")."
            facts.append(ps)

        if sentenceID < beginning:
            # mode: earlySentenceInBlock(blockID, sentenceID).
            ps = "earlySentenceInBlock(" + str(blockID) + "," + \
                 str(blockID) + "_" + str(sentenceID) + ")."
            facts.append(ps)
        elif sentenceID > ending:
            # mode: lateSentenceInBlock(blockID, sentenceID).
            ps = "lateSentenceInBlock(" + str(blockID) + "," + \
                 str(blockID) + "_" + str(sentenceID) + ")."
            facts.append(ps)
        else:
            # mode: midWaySentenceInBlock(blockID, sentenceID).
            ps = "midWaySentenceInBlock(" + str(blockID) + "," + \
                 str(blockID) + "_" + str(sentenceID) + ")."
            facts.append(ps)

        # mode: sentenceInBlock(sentenceID, blockID).
        ps = "sentenceInBlock(" + str(blockID) + "_" + str(sentenceID) + \
             "," + str(blockID) + ")."
        facts.append(ps)

        wordID = 1
        tokens = nltk.word_tokenize(sentence)
        nWords = len(tokens)
        wBeginning = nWords/float(3)
        wEnding = (2*nWords)/float(3)

        for word in tokens:

            # mode: wordString(wordID, #str).
            ps = "wordString(" + str(blockID) + "_" + str(sentenceID) + \
                 "_" + str(wordID) + "," + "'" + str(word) + "')."
            facts.append(ps)

            # mode: partOfSpeechTag(wordID, #POS).
            POS = nltk.pos_tag([word])[0][1]
            ps = "partOfSpeech(" + str(blockID) + "_" + str(sentenceID) + \
                 "_" + str(wordID) + "," + '"' + str(POS) + '").'
            facts.append(ps)

            # mode: nextWordInSentence(sentenceID, wordID, wordID).
            if wordID < nWords:
                ps = "nextWordInSentence(" + str(blockID) + "_" + \
                     str(sentenceID) + "," + str(blockID) + "_" + \
                     str(sentenceID) + "_" + str(wordID) + "," + \
                     str(blockID) + "_" + str(sentenceID) + "_" + \
                     str(wordID+1) + ")."
                facts.append(ps)

            if wordID < wBeginning:
                # mode: earlyWordInSentence(sentenceID, wordID).
                ps = "earlyWordInSentence(" + str(blockID) + "_" + \
                     str(sentenceID) + "," + str(blockID) + "_" + \
                     str(sentenceID) + "_" + str(wordID) + ")."
                facts.append(ps)
            elif wordID > wEnding:
                # mode: lateWordInSentence(sentenceID, wordID).
                ps = "lateWordInSentence(" + str(blockID) + "_" + \
                     str(sentenceID) + "," + str(blockID) + "_" + \
                     str(sentenceID) + "_" + str(wordID) + ")."
                facts.append(ps)
            else:
                # mode: midWayWordInSentence(sentenceID, wordID).
                ps = "midWayWordInSentence(" + str(blockID) + "_" + \
                     str(sentenceID) + "," + str(blockID) + "_" + \
                     str(sentenceID) + "_" + str(wordID) + ")."
                facts.append(ps)

            # mode: wordInSentence(wordID, sentenceID).
            ps = "wordInSentence(" + str(blockID) + "_" + \
                 str(sentenceID) + "_" + str(wordID) + "," + \
                 str(blockID) + "_" + str(sentenceID) + ")."
            facts.append(ps)

            wordID += 1
        sentenceID += 1

    return facts
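
As a quick sanity check, a hypothetical call on a two-sentence block (assuming the required NLTK tokenizer and tagger models are downloaded) begins with these sentence-level facts:

    # Hypothetical usage; the word-level facts that follow depend on
    # NLTK's tokenizer and tagger output.
    facts = __makeIdentifiers(["The dog ran.", "It was fast."], 1)
    # facts[0] == 'nextSentenceInBlock(1,1_1,1_2).'
    # facts[1] == 'midWaySentenceInBlock(1,1_1).'
    # facts[2] == 'sentenceInBlock(1_1,1).'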
1 change: 1 addition & 0 deletions in rnlp/tests/requirements.txt

@@ -4,3 +4,4 @@ pytest-cov
unittest2
nltk
tqdm
joblib