List of usage examples for org.apache.hadoop.mapred.JobConf.getFloat

public float getFloat(String name, float defaultValue)

Parameter: name - the property name.
Returns: the value of the name property as a float, or defaultValue if no such property exists.
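Before the project examples below, here is a minimal self-contained sketch of the call. The property names example.sample.fraction and example.unset.key are hypothetical and exist only for illustration.

import org.apache.hadoop.mapred.JobConf;

public class GetFloatExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // Store a float-valued property.
        conf.setFloat("example.sample.fraction", 0.25f);

        // The property is set, so getFloat returns 0.25f.
        float fraction = conf.getFloat("example.sample.fraction", 0.1f);

        // This property was never set, so getFloat falls back to the default 0.5f.
        float missing = conf.getFloat("example.unset.key", 0.5f);

        System.out.println("fraction=" + fraction + ", missing=" + missing);
    }
}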
From source file: org.apache.tez.mapreduce.hadoop.TestDeprecatedKeys.java
License: Apache License
@Test(timeout = 5000)
/**
 * Set of keys that can be overridden at tez runtime
 */
public void verifyTezOverridenKeys() {
    JobConf jobConf = new JobConf();
    jobConf.setInt(MRJobConfig.IO_SORT_FACTOR, 2000);
    jobConf.setInt(MRJobConfig.IO_SORT_MB, 100);
    jobConf.setInt(MRJobConfig.COUNTERS_MAX_KEY, 100);

    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR, 1000);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB, 200);
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD, true);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD_BYTES, 20);
    jobConf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SORT_SPILL_PERCENT, 0.2f);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_INDEX_CACHE_MEMORY_LIMIT_BYTES, 10);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINE_MIN_SPILLS, 20);
    jobConf.setInt(Constants.TEZ_RUNTIME_TASK_MEMORY, 10);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES, 10);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_FAILURES_LIMIT, 10);
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_NOTIFY_READERROR, true);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_CONNECT_TIMEOUT, 10);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_READ_TIMEOUT, 10);
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_ENABLE_SSL, true);
    jobConf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_BUFFER_PERCENT, 10.0f);
    jobConf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMORY_LIMIT_PERCENT, 10.0f);
    jobConf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MERGE_PERCENT, 10.0f);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMTOMEM_SEGMENTS, 10);
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_ENABLE_MEMTOMEM, true);
    jobConf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_INPUT_POST_MERGE_BUFFER_PERCENT, 10.0f);
    jobConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_INTERNAL_SORTER_CLASS, "DefaultSorter");
    jobConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_GROUP_COMPARATOR_CLASS, "groupComparator");
    jobConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_SECONDARY_COMPARATOR_CLASS, "SecondaryComparator");
    jobConf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, false);
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_COMPRESS, true);

    MRHelpers.translateMRConfToTez(jobConf);

    assertEquals(1000, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR, 0));
    assertEquals(200, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB, 100));
    assertEquals(true, jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD, false));
    assertEquals(20, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD_BYTES, 0));
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_INDEX_CACHE_MEMORY_LIMIT_BYTES, 0));
    assertEquals(20, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINE_MIN_SPILLS, 0));
    assertEquals(10, jobConf.getInt(Constants.TEZ_RUNTIME_TASK_MEMORY, 0));
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES, 0));
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_FAILURES_LIMIT, 0));
    assertEquals(true, jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_NOTIFY_READERROR, false));
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_CONNECT_TIMEOUT, 0));
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_READ_TIMEOUT, 0));
    assertEquals(true, jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_ENABLE_SSL, false));
    assertEquals(10.0f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_BUFFER_PERCENT, 0.0f), 0.0f);
    assertEquals(10.0f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMORY_LIMIT_PERCENT, 0.0f), 0.0f);
    assertEquals(10.0f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MERGE_PERCENT, 0.0f), 0.0f);
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMTOMEM_SEGMENTS, 0));
    assertEquals(true, jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_ENABLE_MEMTOMEM, false));
    assertEquals(10.0f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_INPUT_POST_MERGE_BUFFER_PERCENT, 0.0f), 0.0f);
    assertEquals("DefaultSorter", jobConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_INTERNAL_SORTER_CLASS, ""));
    assertEquals("groupComparator", jobConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_GROUP_COMPARATOR_CLASS, ""));
    assertEquals("SecondaryComparator", jobConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_SECONDARY_COMPARATOR_CLASS, ""));
    assertEquals("DefaultSorter", jobConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_INTERNAL_SORTER_CLASS, ""));
    assertTrue(jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_COMPRESS, false));

    assertNull(jobConf.get(MRConfig.MAPRED_IFILE_READAHEAD));
    assertNull(jobConf.get(MRConfig.MAPRED_IFILE_READAHEAD_BYTES));
    assertNull(jobConf.get(MRJobConfig.RECORDS_BEFORE_PROGRESS));
    assertNull(jobConf.get(MRJobConfig.IO_SORT_FACTOR));
    assertNull(jobConf.get(MRJobConfig.IO_SORT_MB));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_READ_TIMEOUT));
    assertNull(jobConf.get(MRJobConfig.INDEX_CACHE_MEMORY_LIMIT));
    assertNull(jobConf.get(MRJobConfig.MAP_COMBINE_MIN_SPILLS));
    assertNull(jobConf.get(MRJobConfig.REDUCE_MEMORY_TOTAL_BYTES));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_PARALLEL_COPIES));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_FETCH_FAILURES));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_NOTIFY_READERROR));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_CONNECT_TIMEOUT));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_READ_TIMEOUT));
    assertNull(jobConf.get(MRConfig.SHUFFLE_SSL_ENABLED_KEY));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT));
    assertNull(jobConf.get(MRJobConfig.REDUCE_MEMTOMEM_THRESHOLD));
    assertNull(jobConf.get(MRJobConfig.REDUCE_MEMTOMEM_ENABLED));
    assertNull(jobConf.get(MRJobConfig.REDUCE_INPUT_BUFFER_PERCENT));
    assertNull(jobConf.get(MRJobConfig.GROUP_COMPARATOR_CLASS));
    assertNull(jobConf.get(MRJobConfig.GROUP_COMPARATOR_CLASS));
    assertNull(jobConf.get("map.sort.class"));
}
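Note how the test above passes a third argument to assertEquals when comparing float properties: it is the comparison tolerance JUnit requires for floating-point values. A minimal sketch of the same round-trip check, assuming JUnit 4 and a hypothetical property name:

import static org.junit.Assert.assertEquals;

import org.apache.hadoop.mapred.JobConf;
import org.junit.Test;

public class FloatPropertyRoundTripTest {
    @Test
    public void floatPropertySurvivesRoundTrip() {
        JobConf jobConf = new JobConf();
        jobConf.setFloat("example.spill.percent", 0.2f); // hypothetical key

        // The third argument is the tolerance for the float comparison.
        assertEquals(0.2f, jobConf.getFloat("example.spill.percent", 0.0f), 0.0001f);

        // Unset keys fall back to the supplied default.
        assertEquals(0.8f, jobConf.getFloat("example.unset.percent", 0.8f), 0.0f);
    }
}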
From source file: org.wikimedia.wikihadoop.StreamWikiDumpInputFormat.java
License: Apache License
public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize)
        throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Path path = file.getPath();
    long length = file.getLen();
    FileSystem fs = file.getPath().getFileSystem(job);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(fs, path)) {
        long bytesRemaining = length;
        SeekableInputStream in = SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs);
        SplitCompressionInputStream is = in.getSplitCompressionInputStream();
        long start = 0;
        long skip = 0;
        if (is != null) {
            start = is.getAdjustedStart();
            length = is.getAdjustedEnd();
            is.close();
            in = null;
        }
        LOG.info("locations=" + Arrays.asList(blkLocations));
        FileSplit split = null;
        Set<Long> processedPageEnds = new HashSet<Long>();
        float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F);

        READLOOP: while (((double) bytesRemaining) / splitSize > factor && bytesRemaining > 0) {
            // prepare matcher
            ByteMatcher matcher;
            {
                long st = Math.min(start + skip + splitSize, length - 1);
                split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations);
                System.err.println("split move to: " + split);
                if (in != null)
                    in.close();
                if (split.getLength() <= 1) {
                    break;
                }
                in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs);
                SplitCompressionInputStream cin = in.getSplitCompressionInputStream();
            }
            matcher = new ByteMatcher(in);
            // read until the next page end in the look-ahead split
            boolean reach = false;
            while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) {
                if (matcher.getPos() >= length || split.getLength() == length - split.getStart())
                    break READLOOP;
                reach = false;
                split = makeSplit(path, split.getStart(),
                        Math.min(split.getLength() + splitSize, length - split.getStart()), clusterMap,
                        blkLocations);
                System.err.println("split extend to: " + split);
            }
            System.err.println(path + ": #" + splits.size() + " " + pageEndPattern + " found: pos="
                    + matcher.getPos() + " last=" + matcher.getLastUnmatchPos() + " read="
                    + matcher.getReadBytes() + " current=" + start + " remaining=" + bytesRemaining
                    + " split=" + split);
            if (matcher.getLastUnmatchPos() > 0 && matcher.getPos() > matcher.getLastUnmatchPos()
                    && !processedPageEnds.contains(matcher.getPos())) {
                splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations));
                processedPageEnds.add(matcher.getPos());
                long newstart = Math.max(matcher.getLastUnmatchPos(), start);
                bytesRemaining = length - newstart;
                start = newstart;
                skip = 0;
            } else {
                skip = matcher.getPos() - start;
            }
        }
        if (bytesRemaining > 0 && !processedPageEnds.contains(length)) {
            System.err.println(pageEndPattern + " remaining: pos=" + (length - bytesRemaining) + " end=" + length);
            splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
        if (in != null)
            in.close();
    } else if (length != 0) {
        splits.add(makeSplit(path, 0, length, clusterMap, blkLocations));
    } else {
        // Create empty hosts array for zero-length files
        splits.add(makeSplit(path, 0, length, new String[0]));
    }
    return splits;
}
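In this example getFloat supplies a tunable look-ahead threshold: the READLOOP keeps carving splits only while the remaining bytes exceed factor times the split size, falling back to 1.2 when the job does not set KEY_SKIP_FACTOR. A minimal sketch of that pattern follows; SKIP_FACTOR_KEY and countSplits are hypothetical names used only to isolate the idiom.

import org.apache.hadoop.mapred.JobConf;

public class SkipFactorSketch {
    // Hypothetical property name standing in for KEY_SKIP_FACTOR.
    static final String SKIP_FACTOR_KEY = "example.input.skip.factor";

    static int countSplits(JobConf job, long length, long splitSize) {
        // Jobs may override the factor; otherwise use the same 1.2f default.
        float factor = job.getFloat(SKIP_FACTOR_KEY, 1.2f);
        long bytesRemaining = length;
        int splits = 0;
        // Keep carving fixed-size splits while enough data remains.
        while ((double) bytesRemaining / splitSize > factor && bytesRemaining > 0) {
            bytesRemaining -= splitSize;
            splits++;
        }
        if (bytesRemaining > 0) {
            splits++; // the final split absorbs the tail
        }
        return splits;
    }
}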
From source file: skewtune.mapreduce.STJobTracker.java
License: Apache License
@SuppressWarnings("unchecked") STJobTracker(final JobConf conf, String jobtrackerIndentifier) throws IOException, InterruptedException { // find the owner of the process // get the desired principal to load String keytabFilename = conf.get(JTConfig.JT_KEYTAB_FILE); UserGroupInformation.setConfiguration(conf); if (keytabFilename != null) { String desiredUser = conf.get(JTConfig.JT_USER_NAME, System.getProperty("user.name")); UserGroupInformation.loginUserFromKeytab(desiredUser, keytabFilename); mrOwner = UserGroupInformation.getLoginUser(); } else {//from ww w .j a va2s.c o m mrOwner = UserGroupInformation.getCurrentUser(); } supergroup = conf.get(MR_SUPERGROUP, "supergroup"); LOG.info("Starting jobtracker with owner as " + mrOwner.getShortUserName() + " and supergroup as " + supergroup); long secretKeyInterval = conf.getLong(MRConfig.DELEGATION_KEY_UPDATE_INTERVAL_KEY, MRConfig.DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT); long tokenMaxLifetime = conf.getLong(MRConfig.DELEGATION_TOKEN_MAX_LIFETIME_KEY, MRConfig.DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT); long tokenRenewInterval = conf.getLong(MRConfig.DELEGATION_TOKEN_RENEW_INTERVAL_KEY, MRConfig.DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT); secretManager = new DelegationTokenSecretManager(secretKeyInterval, tokenMaxLifetime, tokenRenewInterval, DELEGATION_TOKEN_GC_INTERVAL); secretManager.startThreads(); // // Grab some static constants // NUM_HEARTBEATS_IN_SECOND = conf.getInt(JT_HEARTBEATS_IN_SECOND, DEFAULT_NUM_HEARTBEATS_IN_SECOND); if (NUM_HEARTBEATS_IN_SECOND < MIN_NUM_HEARTBEATS_IN_SECOND) { NUM_HEARTBEATS_IN_SECOND = DEFAULT_NUM_HEARTBEATS_IN_SECOND; } HEARTBEATS_SCALING_FACTOR = conf.getFloat(JT_HEARTBEATS_SCALING_FACTOR, DEFAULT_HEARTBEATS_SCALING_FACTOR); if (HEARTBEATS_SCALING_FACTOR < MIN_HEARTBEATS_SCALING_FACTOR) { HEARTBEATS_SCALING_FACTOR = DEFAULT_HEARTBEATS_SCALING_FACTOR; } // whether to dump or not every heartbeat message even when DEBUG is enabled dumpHeartbeat = conf.getBoolean(JT_HEARTBEATS_DUMP, false); // This is a directory of temporary submission files. We delete it // on startup, and can delete any files that we're done with this.conf = conf; JobConf jobConf = new JobConf(conf); // Set ports, start RPC servers, setup security policy etc. 
InetSocketAddress addr = getAddress(conf); this.localMachine = addr.getHostName(); this.port = addr.getPort(); int handlerCount = conf.getInt(JT_IPC_HANDLER_COUNT, 10); this.interTrackerServer = RPC.getServer(SkewTuneClientProtocol.class, this, addr.getHostName(), addr.getPort(), handlerCount, false, conf, secretManager); if (LOG.isDebugEnabled()) { Properties p = System.getProperties(); for (Iterator it = p.keySet().iterator(); it.hasNext();) { String key = (String) it.next(); String val = p.getProperty(key); LOG.debug("Property '" + key + "' is " + val); } } InetSocketAddress infoSocAddr = NetUtils .createSocketAddr(conf.get(JT_HTTP_ADDRESS, String.format("%s:0", this.localMachine))); String infoBindAddress = infoSocAddr.getHostName(); int tmpInfoPort = infoSocAddr.getPort(); this.startTime = System.currentTimeMillis(); infoServer = new HttpServer("job", infoBindAddress, tmpInfoPort, tmpInfoPort == 0, conf); infoServer.setAttribute("job.tracker", this); infoServer.addServlet("jobcompletion", "/completion", JobCompletionServlet.class); infoServer.addServlet("taskspeculation", "/speculation", SpeculationEventServlet.class); infoServer.addServlet("skewreport", "/skew", SkewReportServlet.class); infoServer.addServlet("tasksplit", "/split/*", SplitTaskServlet.class); infoServer.addServlet("tasksplitV2", "/splitV2/*", SplitTaskV2Servlet.class); infoServer.start(); this.trackerIdentifier = jobtrackerIndentifier; // The rpc/web-server ports can be ephemeral ports... // ... ensure we have the correct info this.port = interTrackerServer.getListenerAddress().getPort(); this.conf.set(JT_IPC_ADDRESS, (this.localMachine + ":" + this.port)); LOG.info("JobTracker up at: " + this.port); this.infoPort = this.infoServer.getPort(); this.conf.set(JT_HTTP_ADDRESS, infoBindAddress + ":" + this.infoPort); LOG.info("JobTracker webserver: " + this.infoServer.getPort()); this.defaultNotificationUrl = String.format("http://%s:%d/completion?jobid=$jobId&status=$jobStatus", infoBindAddress, this.infoPort); LOG.info("JobTracker completion URI: " + defaultNotificationUrl); // this.defaultSpeculationEventUrl = String.format("http://%s:%d/speculation?taskid=$taskId&remainTime=$taskRemainTime",infoBindAddress,this.infoPort); this.defaultSpeculationEventUrl = String.format("http://%s:%d/speculation?jobid=$jobId", infoBindAddress, this.infoPort); LOG.info("JobTracker speculation event URI: " + defaultSpeculationEventUrl); this.defaultSkewReportUrl = String.format("http://%s:%d/skew", infoBindAddress, this.infoPort); LOG.info("JobTracker skew report event URI: " + defaultSkewReportUrl); this.trackerHttp = String.format("http://%s:%d", infoBindAddress, this.infoPort); while (!Thread.currentThread().isInterrupted()) { try { // if we haven't contacted the namenode go ahead and do it if (fs == null) { fs = mrOwner.doAs(new PrivilegedExceptionAction<FileSystem>() { @Override public FileSystem run() throws IOException { return FileSystem.get(conf); } }); } // clean up the system dir, which will only work if hdfs is out // of safe mode if (systemDir == null) { systemDir = new Path(getSystemDir()); } try { FileStatus systemDirStatus = fs.getFileStatus(systemDir); if (!systemDirStatus.getOwner().equals(mrOwner.getShortUserName())) { throw new AccessControlException( "The systemdir " + systemDir + " is not owned by " + mrOwner.getShortUserName()); } if (!systemDirStatus.getPermission().equals(SYSTEM_DIR_PERMISSION)) { LOG.warn("Incorrect permissions on " + systemDir + ". 
Setting it to " + SYSTEM_DIR_PERMISSION); fs.setPermission(systemDir, new FsPermission(SYSTEM_DIR_PERMISSION)); } else { break; } } catch (FileNotFoundException fnf) { } // ignore } catch (AccessControlException ace) { LOG.warn("Failed to operate on " + JTConfig.JT_SYSTEM_DIR + "(" + systemDir + ") because of permissions."); LOG.warn("Manually delete the " + JTConfig.JT_SYSTEM_DIR + "(" + systemDir + ") and then start the JobTracker."); LOG.warn("Bailing out ... "); throw ace; } catch (IOException ie) { LOG.info("problem cleaning system directory: " + systemDir, ie); } Thread.sleep(FS_ACCESS_RETRY_PERIOD); } if (Thread.currentThread().isInterrupted()) { throw new InterruptedException(); } // initialize cluster variable cluster = new Cluster(this.conf); // now create a job client proxy jtClient = (ClientProtocol) RPC.getProxy(ClientProtocol.class, ClientProtocol.versionID, JobTracker.getAddress(conf), mrOwner, this.conf, NetUtils.getSocketFactory(conf, ClientProtocol.class)); new SpeculativeScheduler().start(); // initialize task event fetcher new TaskCompletionEventFetcher().start(); // Same with 'localDir' except it's always on the local disk. asyncDiskService = new MRAsyncDiskService(FileSystem.getLocal(conf), conf.getLocalDirs()); asyncDiskService.moveAndDeleteFromEachVolume(SUBDIR); // keep at least one asynchronous worker per CPU core int numProcs = Runtime.getRuntime().availableProcessors(); LOG.info("# of available processors = " + numProcs); int maxFactor = conf.getInt(JT_MAX_ASYNC_WORKER_FACTOR, 2); asyncWorkers = new ThreadPoolExecutor(numProcs, numProcs * maxFactor, 30, TimeUnit.SECONDS, new SynchronousQueue<Runnable>(true), new ThreadPoolExecutor.CallerRunsPolicy()); speculativeSplit = conf.getBoolean(JT_SPECULATIVE_SPLIT, false); }
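The constructor above reads HEARTBEATS_SCALING_FACTOR with getFloat and silently resets it to the default when it falls below a minimum, so a bad configuration value cannot destabilize the tracker. A minimal sketch of that read-then-clamp idiom; the property name and constants here are hypothetical stand-ins for JT_HEARTBEATS_SCALING_FACTOR and its bounds.

import org.apache.hadoop.mapred.JobConf;

public class ClampedFloatConfig {
    // Hypothetical names mirroring the heartbeat scaling-factor settings.
    static final String SCALING_FACTOR_KEY = "example.heartbeats.scaling.factor";
    static final float DEFAULT_SCALING_FACTOR = 1.0f;
    static final float MIN_SCALING_FACTOR = 0.01f;

    static float readScalingFactor(JobConf conf) {
        float factor = conf.getFloat(SCALING_FACTOR_KEY, DEFAULT_SCALING_FACTOR);
        // Fall back to the default rather than run with a nonsensical value.
        if (factor < MIN_SCALING_FACTOR) {
            factor = DEFAULT_SCALING_FACTOR;
        }
        return factor;
    }
}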
From source file: wiki.hadoop.mapred.lib.input.StreamWikiDumpInputFormat.java
License: Apache License
public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize)
        throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Path path = file.getPath();
    long length = file.getLen();
    FileSystem fs = file.getPath().getFileSystem(job);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(fs, path)) {
        long bytesRemaining = length;
        SeekableInputStream in = SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs);
        SplitCompressionInputStream is = in.getSplitCompressionInputStream();
        long start = 0;
        long skip = 0;
        if (is != null) {
            start = is.getAdjustedStart();
            length = is.getAdjustedEnd();
            is.close();
            in = null;
        }
        LOG.info("locations=" + Arrays.asList(blkLocations));
        FileSplit split = null;
        Set<Long> processedPageEnds = new HashSet<Long>();
        float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F);

        READLOOP: while (((double) bytesRemaining) / splitSize > factor && bytesRemaining > 0) {
            // prepare matcher
            ByteMatcher matcher;
            {
                long st = Math.min(start + skip + splitSize, length - 1);
                split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations);
                System.err.println("split move to: " + split);
                if (in != null)
                    in.close();
                if (split.getLength() <= 1) {
                    break;
                }
                in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs);
                SplitCompressionInputStream cin = in.getSplitCompressionInputStream();
            }
            matcher = new ByteMatcher(in, split.getStart());
            // read until the next page end in the look-ahead split
            boolean reach = false;
            while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) {
                if (matcher.getPos() >= length || split.getLength() == length - split.getStart())
                    break READLOOP;
                reach = false;
                split = makeSplit(path, split.getStart(),
                        Math.min(split.getLength() + splitSize, length - split.getStart()), clusterMap,
                        blkLocations);
                System.err.println("split extend to: " + split);
            }
            System.err.println(path + ": #" + splits.size() + " " + pageEndPattern + " found: pos="
                    + matcher.getPos() + " last=" + matcher.getLastUnmatchPos() + " read="
                    + matcher.getReadBytes() + " current=" + start + " remaining=" + bytesRemaining
                    + " split=" + split);
            if (matcher.getLastUnmatchPos() > 0 && matcher.getPos() > matcher.getLastUnmatchPos()
                    && !processedPageEnds.contains(matcher.getPos())) {
                splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations));
                processedPageEnds.add(matcher.getPos());
                long newstart = Math.max(matcher.getLastUnmatchPos(), start);
                bytesRemaining = length - newstart;
                start = newstart;
                skip = 0;
            } else {
                skip = matcher.getPos() - start;
            }
        }
        if (bytesRemaining > 0 && !processedPageEnds.contains(length)) {
            System.err.println(pageEndPattern + " remaining: pos=" + (length - bytesRemaining) + " end=" + length);
            splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
        if (in != null)
            in.close();
    } else if (length != 0) {
        splits.add(makeSplit(path, 0, length, clusterMap, blkLocations));
    } else {
        // Create empty hosts array for zero-length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
    }
    return splits;
}