dispy : Distribute Computations for Parallel Execution
dispy Examples
A simple version of the word count example from MapReduce:
#!/usr/bin/env python

# a version of the word frequency example from the MapReduce tutorial

def mapper(doc):
    # input reader and map function are combined: read the document and
    # emit a (word, 1) pair for every qualifying word
    import os
    words = []
    with open(os.path.join('/tmp', doc)) as fd:
        for line in fd:
            words.extend((word.lower(), 1) for word in line.split()
                         if len(word) > 3 and word.isalpha())
    return words
def reducer(words):
    # we should generate sorted lists which are then merged,
    # but to keep things simple, we use dicts
    word_count = {}
    for word, count in words:
        if word not in word_count:
            word_count[word] = 0
        word_count[word] += count
    # print('reducer: %s to %s' % (len(words), len(word_count)))
    return word_count
if __name__ == '__main__':
    import dispy

    # map phase:
    # nodes node1 and node2 have 'doc1', 'doc2' etc. on their
    # local storage under /tmp, so no need to transfer them
    map_cluster = dispy.JobCluster(mapper, nodes=['node1', 'node2'], pulse_interval=2,
                                   resubmit=True)
    reduce_cluster = dispy.JobCluster(reducer, nodes=['*'], pulse_interval=2,
                                      resubmit=True)
    map_jobs = []
    for f in ['doc1', 'doc2', 'doc3', 'doc4', 'doc5']:
        job = map_cluster.submit(f)
        map_jobs.append(job)

    # collect map results and submit them to the reduce cluster in chunks
    reduce_jobs = []
    for map_job in map_jobs:
        words = map_job()   # wait for the job and get its result
        if not words:
            print(map_job.exception)
            continue
        # simple partition: chunks of at most 1000 (word, count) pairs
        n = 0
        while n < len(words):
            m = min(len(words) - n, 1000)
            reduce_job = reduce_cluster.submit(words[n:n+m])
            reduce_jobs.append(reduce_job)
            n += m

    # reduce phase: merge the partial counts from all reduce jobs
    word_count = {}
    for reduce_job in reduce_jobs:
        words = reduce_job()
        if not words:
            print(reduce_job.exception)
            continue
        for word, count in words.items():
            if word not in word_count:
                word_count[word] = 0
            word_count[word] += count

    # sort words by frequency and print
    for word in sorted(word_count, key=lambda x: word_count[x], reverse=True):
        print(word, word_count[word])
    reduce_cluster.stats()
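
To run the example, dispynode (dispynode.py) must already be running on node1, node2 and on whatever nodes serve the reduce cluster, and doc1 through doc5 must exist under /tmp on node1 and node2.

Note that map_job() returns the job's result, so the 'if not words:' test above also treats an empty (but successful) result as a failure. A job's status attribute can tell the two cases apart; the following is a minimal sketch of that variation (not part of the original example), reusing the map_jobs list from above:

for map_job in map_jobs:
    map_job()                                    # wait for the job to finish
    if map_job.status != dispy.DispyJob.Finished:
        print(map_job.exception)                 # traceback from the node, if any
        continue
    words = map_job.result or []
    # ... partition 'words' and submit to reduce_cluster as above ...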