Usage examples for org.apache.hadoop.fs.Path#getFileSystem(Configuration)
public FileSystem getFileSystem(Configuration conf) throws IOException
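Before the project-specific examples, a minimal, self-contained sketch of the call itself (the URI and class name below are hypothetical, not taken from any of the examples that follow): the Path's scheme and authority determine which FileSystem implementation is returned for the given Configuration; a Path without a scheme resolves against the default filesystem configured via fs.defaultFS (fs.default.name in older releases).

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // A fully qualified path; its scheme ("hdfs") selects the FileSystem implementation.
        Path path = new Path("hdfs://namenode:8020/user/demo/input.txt"); // hypothetical URI
        FileSystem fs = path.getFileSystem(conf);
        if (fs.exists(path)) {
            // Use the resolved FileSystem for further I/O against the same path.
            long len = fs.getFileStatus(path).getLen();
            System.out.println(path + " has length " + len);
        }
    }
}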
From source file: bixo.examples.webmining.DemoWebMiningWorkflow.java
License: Apache License
public static Flow createWebMiningWorkflow(Path crawlDbPath, Path curLoopDirPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, DemoWebMiningOptions options) throws IOException, InterruptedException {

    // Fetch at most 200 pages, max size of 128K, complete mode, from the current dir.
    // HTML only.

    // We want to extract the cleaned up HTML, and pass that to the parser, which will
    // be specified via options.getAnalyzer. From this we'll get outlinks, page score, and
    // any results.
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    boolean isLocal = HadoopUtils.isJobLocal(conf);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    conf.setInt("mapred.min.split.size", 64 * 1024 * 1024);
    Properties props = HadoopUtils.getDefaultProperties(DemoWebMiningWorkflow.class, false, conf);
    FileSystem fs = crawlDbPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    Tap inputSource = new Hfs(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES),
            crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());
    Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
    Pipe urlsToFetchPipe = splitter.getLHSPipe();

    // Limit to MAX_DISTRIBUTED_FETCH if running in real cluster,
    // or MAX_LOCAL_FETCH if running locally. So first we sort the entries
    // from high to low by links score.
    // TODO add unit test
    urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
    long maxToFetch = HadoopUtils.isJobLocal(conf) ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

    BaseScoreGenerator scorer = new LinkScoreGenerator();

    // Create the sub-assembly that runs the fetch job
    int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
    SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);

    Pipe analyzerPipe = new Pipe("analyzer pipe");
    analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

    Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
    outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());

    Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
    resultsPipe = new Each(resultsPipe, new CreateResultsFunction());

    // Group the finished datums, the skipped datums, status, outlinks
    Pipe updatePipe = new CoGroup("update pipe",
            Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
            Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                    new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)),
            null, new OuterJoin());
    updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

    // Output: loop dir specific crawldb
    Path outCrawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawlDbSink = new Hfs(new TextLine(), outCrawlDbPath.toString());

    // Status
    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Content
    Path contentDirPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    // PageResults
    Path resultsDirPath = new Path(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
    Tap resultsSink = new Hfs(new TextLine(), resultsDirPath.toString());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(updatePipe.getName(), crawlDbSink);
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(resultsPipe.getName(), resultsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

    return flow;
}
From source file: bixo.examples.webmining.WebMiningTool.java
License: Apache License
public static void main(String[] args) throws IOException {
    WebMiningOptions options = new WebMiningOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Build and run the flow.
    try {
        Path workingDirPath = new Path(options.getWorkingDir());
        JobConf conf = new JobConf();
        FileSystem fs = workingDirPath.getFileSystem(conf);
        setupWorkingDir(fs, workingDirPath, CrawlConfig.SEED_URLS_FILENAME);

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        if (latestDirPath == null) {
            error("No previous cycle output dirs exist in " + workingDirPath, parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // We only care about mime types that the Tika HTML parser can handle,
        // so restrict it to the same.
        Set<String> validMimeTypes = new HashSet<String>();
        Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
        for (MediaType supportedType : supportedTypes) {
            validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
        }
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        // Let's limit our crawl to two loops
        for (int curLoop = 1; curLoop <= 2; curLoop++) {
            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, curLoop);
            Flow flow = WebMiningWorkflow.createWebMiningWorkflow(crawlDbPath, curLoopDirPath, fetcherPolicy,
                    userAgent, options, curLoop == 1);
            flow.complete();

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (Exception e) {
        System.err.println("Exception running job: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file: bixo.examples.webmining.WebMiningWorkflow.java
License: Apache License
public static void importSeedUrls(Path crawlDbPath, String fileName) throws IOException, InterruptedException {
    SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
    JobConf defaultJobConf = HadoopUtils.getDefaultJobConf();

    InputStream is = null;
    TupleEntryCollector writer = null;
    try {
        Tap urlSink = new Hfs(new TextLine(), crawlDbPath.toString(), true);
        writer = urlSink.openForWrite(defaultJobConf);

        is = WebMiningWorkflow.class.getResourceAsStream(fileName);
        if (is == null) {
            throw new FileNotFoundException("The seed urls file doesn't exist");
        }

        List<String> lines = IOUtils.readLines(is);
        for (String line : lines) {
            line = line.trim();
            if (line.startsWith("#")) {
                continue;
            }

            CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize(line), 0, UrlStatus.UNFETCHED, 0.0f, 0.0f);
            writer.add(datum.getTuple());
        }

        writer.close();
    } catch (IOException e) {
        HadoopUtils.safeRemove(crawlDbPath.getFileSystem(defaultJobConf), crawlDbPath);
        throw e;
    } finally {
        IoUtils.safeClose(is);
        if (writer != null) {
            writer.close();
        }
    }
}
From source file: bixo.examples.webmining.WebMiningWorkflow.java
License: Apache License
public static Flow createWebMiningWorkflow(Path crawlDbPath, Path curLoopDirPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, WebMiningOptions options, boolean resetSolr)
        throws IOException, InterruptedException {

    // Fetch at most 200 pages, max size of 128K, complete mode, from the current dir.
    // HTML only.

    // We want to extract the cleaned up HTML, and pass that to the parser, which will
    // be specified via options.getAnalyzer. From this we'll get outlinks, page score, and
    // any results.
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    boolean isLocal = HadoopUtils.isJobLocal(conf);
    int numReducers = 1; // we always want to use a single reducer, to avoid contention
    conf.setNumReduceTasks(numReducers);
    conf.setInt("mapred.min.split.size", 64 * 1024 * 1024);
    Properties props = HadoopUtils.getDefaultProperties(WebMiningWorkflow.class, false, conf);
    FileSystem fs = crawlDbPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    Tap inputSource = new Hfs(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES),
            crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());
    Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
    Pipe urlsToFetchPipe = splitter.getLHSPipe();

    // Limit to MAX_DISTRIBUTED_FETCH if running in real cluster,
    // or MAX_LOCAL_FETCH if running locally. So first we sort the entries
    // from high to low by links score.
    // TODO add unit test
    urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
    long maxToFetch = HadoopUtils.isJobLocal(conf) ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

    BaseScoreGenerator scorer = new LinkScoreGenerator();

    // Create the sub-assembly that runs the fetch job
    int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
    SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);

    Pipe analyzerPipe = new Pipe("analyzer pipe");
    analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

    Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
    outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());

    Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
    resultsPipe = new Each(resultsPipe, new CreateResultsFunction());

    // Group the finished datums, the skipped datums, status, outlinks
    Pipe updatePipe = new CoGroup("update pipe",
            Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
            Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                    new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)),
            null, new OuterJoin());
    updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

    // Output: loop dir specific crawldb
    Path outCrawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawlDbSink = new Hfs(new TextLine(), outCrawlDbPath.toString());

    // Status
    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Content
    Path contentDirPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    // PageResults
    Path resultsDirPath = new Path(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
    Tap resultsSink = new Hfs(new TextLine(), resultsDirPath.toString());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(updatePipe.getName(), crawlDbSink);
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(resultsPipe.getName(), resultsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

    return flow;
}
From source file: boa.aggregators.MLAggregator.java
License: Apache License
public void saveModel(Object model) {
    FSDataOutputStream out = null;
    FileSystem fileSystem = null;
    Path filePath = null;

    try {
        JobContext context = (JobContext) getContext();
        Configuration configuration = context.getConfiguration();
        int boaJobId = configuration.getInt("boa.hadoop.jobid", 0);
        JobConf job = new JobConf(configuration);
        Path outputPath = FileOutputFormat.getOutputPath(job);
        fileSystem = outputPath.getFileSystem(context.getConfiguration());

        fileSystem.mkdirs(new Path("/boa", new Path("" + boaJobId)));
        filePath = new Path("/boa",
                new Path("" + boaJobId, new Path(("" + getKey()).split("\\[")[0] + "ML.model")));

        if (fileSystem.exists(filePath))
            return;

        out = fileSystem.create(filePath);

        ByteArrayOutputStream byteOutStream = new ByteArrayOutputStream();
        ObjectOutputStream objectOut = new ObjectOutputStream(byteOutStream);
        objectOut.writeObject(model);
        objectOut.close();
        byte[] serializedObject = byteOutStream.toByteArray();
        out.write(serializedObject, 0, serializedObject.length);

        this.collect(filePath.toString());
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            if (out != null)
                out.close();
        } catch (final Exception e) {
            e.printStackTrace();
        }
    }
}
From source file: boa.datagen.SeqSortMerge.java
License: Apache License
private static long getLen(FileStatus file) throws IOException {
    Path path = new Path(file.getPath(), "part-00000");
    long len = path.getFileSystem(conf).getFileStatus(path).getLen();
    return len;
}
From source file: bucket_sort.NLineInputFormat.java
License: Apache License
public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numLinesPerSplit)
        throws IOException {
    List<FileSplit> splits = new ArrayList<FileSplit>();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }

    FileSystem fs = fileName.getFileSystem(conf);
    LineReader lr = null;
    try {
        FSDataInputStream in = fs.open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num = -1;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                // NLineInputFormat uses LineRecordReader, which always reads
                // (and consumes) at least one character out of its upper split
                // boundary. So to make sure that each mapper gets N lines, we
                // move back the upper split limits of each split
                // by one character here.
                if (begin == 0) {
                    splits.add(new FileSplit(fileName, begin, length - 1, new String[] {}));
                } else {
                    splits.add(new FileSplit(fileName, begin - 1, length, new String[] {}));
                }
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            splits.add(new FileSplit(fileName, begin, length, new String[] {}));
        }
    } finally {
        if (lr != null) {
            lr.close();
        }
    }
    return splits;
}
From source file: byte_import.HexastoreBulkImport.java
License: Open Source License
private void loadHFiles() throws Exception {
    conf = HBaseConfiguration.create();
    HBaseAdmin hadmin = new HBaseAdmin(conf);
    Path hfofDir = new Path("out");
    FileSystem fs = hfofDir.getFileSystem(conf);
    //if (!fs.exists(hfofDir)) {
    //    throw new FileNotFoundException("HFileOutputFormat dir " + hfofDir + " not found");
    //}
    FileStatus[] familyDirStatuses = fs.listStatus(hfofDir);
    //if (familyDirStatuses == null) {
    //    throw new FileNotFoundException("No families found in " + hfofDir);
    //}
    int length = 0;
    byte[][] splits = new byte[18000][];
    for (FileStatus stat : familyDirStatuses) {
        if (!stat.isDir()) {
            continue;
        }
        Path familyDir = stat.getPath();
        // Skip _logs, etc
        if (familyDir.getName().startsWith("_"))
            continue;
        //byte[] family = familyDir.getName().getBytes();
        Path[] hfiles = FileUtil.stat2Paths(fs.listStatus(familyDir));
        for (Path hfile : hfiles) {
            if (hfile.getName().startsWith("_"))
                continue;
            HFile.Reader hfr = HFile.createReader(fs, hfile, new CacheConfig(conf));
            //HFile.Reader hfr = new HFile.Reader(fs, hfile, null, false);
            final byte[] first;
            try {
                hfr.loadFileInfo();
                first = hfr.getFirstRowKey();
            } finally {
                hfr.close();
            }
            splits[length] = first.clone();
            length++;
        }
    }
    //System.out.println(length);
    byte[][] splits1 = new byte[length][];
    for (int i = 0; i < splits1.length; i++) {
        splits1[i] = splits[i];
    }
    Arrays.sort(splits1, Bytes.BYTES_COMPARATOR);

    //HTableDescriptor desc = new HTableDescriptor("H2RDF");
    HTableDescriptor desc = new HTableDescriptor(TABLE_NAME);
    HColumnDescriptor family = new HColumnDescriptor("A");
    desc.addFamily(family);
    //for (int i = 0; i < splits.length; i++) {
    //    System.out.println(Bytes.toStringBinary(splits[i]));
    //}
    conf.setInt("zookeeper.session.timeout", 600000);
    if (hadmin.tableExists(TABLE_NAME)) {
        hadmin.disableTable(TABLE_NAME);
        hadmin.deleteTable(TABLE_NAME);
    } else {
        hadmin.createTable(desc, splits1);
    }
    //hadmin.createTable(desc);

    String[] args1 = new String[2];
    args1[0] = "out";
    args1[1] = TABLE_NAME;
    //args1[1] = "new2";
    ToolRunner.run(new LoadIncrementalHFiles(HBaseConfiguration.create()), args1);
}
From source file: ca.sparkera.adapters.mapred.MainframeVBInputFormat.java
License: Apache License
/**
 * Splits files returned by {@link #listStatus(JobConf)} when they're too big.
 */
@Override
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    FileStatus[] files = listStatus(job);

    for (FileStatus file : files) {
        // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        FSDataInputStream fileIn;
        InputStream inputStream;
        fileIn = fs.open(path);
        inputStream = fileIn;
        filePosition = fileIn;
        long offset = 0;
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long bytesRemaining = length;
            long splitSize = 0;
            while (offset < length) {
                splitSize = computeSplitSize(goalSize, minSize, blockSize, inputStream);
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
                offset = length - bytesRemaining;
            }

            if (bytesRemaining != 0) {
                throw new IOException(
                        "Partial record(length = " + bytesRemaining + ") found at the end of file " + path);
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
        if (inputStream != null) {
            inputStream.close();
            inputStream = null;
        }
    }

    java.util.Date date = new java.util.Date();
    System.out.println((new Timestamp(date.getTime())) + ",\t Split = 100% Total Splits - " + (++splitCount)
            + "\t Total Records in VB file - " + totalRecords);

    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}
From source file: ca.sparkera.adapters.mapreduce.MainframeVBRecordReader.java
License: Apache License
public void initialize(Configuration job, long splitStart, long splitLength, Path file) throws IOException {
    start = splitStart;
    end = start + splitLength;
    LOG.info("Start of the split:" + start + "-End of split:" + end);
    LOG.debug("VLR initialize started: start pos:" + start + "endpos:" + end);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        CompressionInputStream cIn = codec.createInputStream(fileIn, decompressor);
        filePosition = cIn;
        inputStream = cIn;
        LOG.info("Compressed input; cannot compute number of records in the split");
    } else {
        fileIn.seek(start);
        filePosition = fileIn;
        inputStream = fileIn;
        numBytesRemainingInSplit = splitLength;
        LOG.info("Variable length input; cannot compute number of records in the split");
    }
    this.pos = start;
}