diff --git a/core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoInputFormat.java b/core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoInputFormat.java index 1971151a7..81e306d0b 100644 --- a/core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoInputFormat.java +++ b/core/src/main/java/com/twitter/elephantbird/mapreduce/input/LzoInputFormat.java @@ -6,6 +6,7 @@ import java.util.List; import com.twitter.elephantbird.util.HadoopCompat; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -29,6 +30,8 @@ public abstract class LzoInputFormat extends FileInputFormat { private static final Logger LOG = LoggerFactory.getLogger(LzoInputFormat.class); + public static final String READ_INDEXES_KEY = "elephantbird.index.read"; + private final PathFilter hiddenPathFilter = new PathFilter() { // avoid hidden files and directories. @Override @@ -112,6 +115,11 @@ protected boolean isSplitable(JobContext context, Path filename) { public List getSplits(JobContext job) throws IOException { List defaultSplits = super.getSplits(job); + Configuration config = HadoopCompat.getConfiguration(job); + + boolean readIndexes = config.getBoolean(READ_INDEXES_KEY, true); + if (!readIndexes) return defaultSplits; + // Find new starts and ends of the file splits that align with the lzo blocks. List result = new ArrayList(); @@ -127,7 +135,7 @@ public List getSplits(JobContext job) throws IOException { if ( file.equals(prevFile) ) { index = prevIndex; } else { - index = LzoIndex.readIndex(file.getFileSystem(HadoopCompat.getConfiguration(job)), file); + index = LzoIndex.readIndex(file.getFileSystem(config), file); prevFile = file; prevIndex = index; }