List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all the files that match pathPattern and are not checksum files. Results are sorted by their names.
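Before the source-file examples below, here is a minimal, self-contained sketch of the typical call pattern. The glob string and output directory are hypothetical, not taken from any of the examples:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical pattern: match all reducer part files under a job output directory.
        FileStatus[] matches = fs.globStatus(new Path("/user/example/output/part-*"));
        // globStatus returns an empty array when the pattern matches nothing,
        // and may return null when no path component of the pattern exists,
        // so guard for both before iterating.
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}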
From source file:org.commoncrawl.service.pagerank.master.PageRankMaster.java
License:Open Source License
private int findListValidIteration(FileSystem fs, Path jobDataPath) throws IOException {
    // scan job directory for best value candidate
    Path valueSearchPattern = new Path(jobDataPath, "value_*-00000");
    FileStatus candidates[] = fs.globStatus(valueSearchPattern);

    int lastValidIterationNo = -1;
    ArrayList<Path> iterationSpecificValues = new ArrayList<Path>();

    for (FileStatus candidate : candidates) {
        // extract iteration portion of name
        String iterationStr = candidate.getPath().getName().substring("value_".length(),
                "value_".length() + 5);
        // parse
        try {
            int iterationId = NUMBER_FORMAT.parse(iterationStr).intValue();
            // now see if we are up to PR_NUM_SLAVES values
            Path iterationSpecificSearchPattern = new Path(jobDataPath, "value_" + iterationStr + "-*");
            // count results
            FileStatus iterationSpecificEntries[] = fs.globStatus(iterationSpecificSearchPattern);
            if (iterationSpecificEntries.length == CrawlEnvironment.PR_NUMSLAVES) {
                LOG.info("Iteration Number:" + iterationId + " has the proper number of results");
                if (lastValidIterationNo == -1 || lastValidIterationNo < iterationId) {
                    // set this iteration as the valid iteration
                    lastValidIterationNo = iterationId;
                    LOG.info("Setting Iteration:" + iterationId + " as last valid iteration number");
                    // clear candidate list
                    iterationSpecificValues.clear();
                    // add paths to candidate list ...
                    for (FileStatus iterationSpecificEntry : iterationSpecificEntries) {
                        iterationSpecificValues.add(iterationSpecificEntry.getPath());
                    }
                }
            } else {
                LOG.error("Skipping Iteration Number:" + iterationId + ". It only has:"
                        + iterationSpecificEntries.length + " results");
            }
        } catch (ParseException e) {
            LOG.error(CCStringUtils.stringifyException(e));
        }
    }
    return lastValidIterationNo;
}
From source file:org.commoncrawl.service.pagerank.master.PageRankMaster.java
License:Open Source License
private void loadState() throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    // allocate a new server state ...
    _serverState = new PRMasterState();
    // initially in idle state
    _serverState.setServerStatus(PRMasterState.ServerStatus.IDLE);

    // paths
    Path valuesFilePath = new Path("crawl/pageRank/seed/" + _jobId + "/values");
    Path edgesFilePath = new Path("crawl/pageRank/seed/" + _jobId + "/edges");

    // figure out number of values
    int itemCount = fs.globStatus(new Path(valuesFilePath, "value_*")).length;
    LOG.info("There are:" + itemCount + " values for job:" + _jobId);

    Path jobsPath = new Path("crawl/pageRank/jobs/" + _jobId);
    fs.mkdirs(jobsPath);

    // find last valid iteration
    int lastValidIteration = findListValidIteration(fs, jobsPath);
    LOG.info("Last Valid Iteration for Job:" + _jobId + " is:" + lastValidIteration);
    int nextIteration = lastValidIteration + 1;

    // clear all check point and distribution files
    clearAllCheckpointAndDistributionFiles(fs, jobsPath);

    PageRankJobConfig jobConfig = new PageRankJobConfig();

    jobConfig.setJobId(_jobId);
    jobConfig.setIterationNumber(nextIteration);
    jobConfig.setMaxIterationNumber(1000);
    jobConfig.setSlaveCount(itemCount);
    jobConfig.setInputValuesPath(valuesFilePath.toString());
    jobConfig.setOutlinksDataPath(edgesFilePath.toString());

    // set up job dir ...
    Path jobPath = new Path(_hdfsWorkingDir, Long.toString(jobConfig.getJobId()));
    jobConfig.setJobWorkPath(jobPath.toString());
    jobConfig.setAlgorithmId(0);
    jobConfig.setAlpha(.85f);

    _jobConfig = jobConfig;

    // update server state ...
    _serverState.setActiveJobConfig(_jobConfig);
    _serverState.setFieldDirty(PRMasterState.Field_ACTIVEJOBCONFIG);
    _serverState.setServerStatus(PRMasterState.ServerStatus.STARTED);
}
From source file:org.commoncrawl.service.queryserver.master.MasterServer.java
License:Open Source License
private Path locateQueryDBPath() throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    FileStatus statusArray[] = fs.globStatus(new Path("crawl/querydb/db/*"));

    Path candidatePath = null;

    for (FileStatus fileStatus : statusArray) {
        if (candidatePath == null) {
            candidatePath = fileStatus.getPath();
        } else {
            long prevTimestamp = Long.parseLong(candidatePath.getName());
            long currentTimestamp = Long.parseLong(fileStatus.getPath().getName());
            if (currentTimestamp > prevTimestamp) {
                candidatePath = fileStatus.getPath();
            }
        }
    }

    if (candidatePath != null) {
        LOG.info("Selected Candidate Path:" + candidatePath);
    }
    return candidatePath;
}
From source file:org.commoncrawl.service.queryserver.master.QueryServerFE.java
License:Open Source License
private static long findLatestDatabaseTimestamp(Path rootPath) throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    FileStatus candidates[] = fs.globStatus(new Path(rootPath, "*"));

    long candidateTimestamp = -1L;

    for (FileStatus candidate : candidates) {
        LOG.info("Found Seed Candidate:" + candidate.getPath());
        long timestamp = Long.parseLong(candidate.getPath().getName());
        if (candidateTimestamp == -1 || candidateTimestamp < timestamp) {
            candidateTimestamp = timestamp;
        }
    }
    LOG.info("Selected Candidate is:" + candidateTimestamp);
    return candidateTimestamp;
}
From source file:org.commoncrawl.service.queryserver.query.DomainListQuery.java
License:Open Source License
@Override
protected long executeLocal(FileSystem remoteFileSystem, Configuration conf,
        DatabaseIndexV2.MasterDatabaseIndex index, EventLoop eventLoop, File tempFirDir,
        QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> requestObject) throws IOException {

    Path mergeResultsPath = new Path(
            getLocalQueryResultsPathPrefix(requestObject) + getMergedResultsFileName());

    LOG.info("Execute Local called for Query:" + getQueryId() + " MergeResultsPath is:" + mergeResultsPath);

    // get a local file system object
    FileSystem localFileSystem = FileSystem.getLocal(conf);

    //LOG.info("Executing LocalQuery - checking if MergedFile:" + mergeResultsPath + " Exists");
    // if source merged results path does not exist ...
    if (!localFileSystem.exists(mergeResultsPath)) {
        LOG.info("Execute Local for Query:" + getQueryId() + " Source MergeFile:" + mergeResultsPath
                + " Not Found. Checking for parts files");
        // collect parts ...
        Vector<Path> parts = new Vector<Path>();
        FileStatus fileStatusArray[] = remoteFileSystem
                .globStatus(new Path(getHDFSQueryResultsPath(), "part-*"));

        if (fileStatusArray.length == 0) {
            LOG.error("Execute Local for Query:" + getQueryId() + " FAILED. No Parts Files Found!");
            throw new IOException("Remote Component Part Files Not Found");
        }

        for (FileStatus part : fileStatusArray) {
            //LOG.info("Found Part:" + part);
            parts.add(part.getPath());
        }

        LOG.info("Execute Local for Query:" + getQueryId() + " Initializing Merger");
        SequenceFileSpillWriter<Text, SubDomainMetadata> mergedFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
                localFileSystem, conf, mergeResultsPath, Text.class, SubDomainMetadata.class,
                new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
                        PositionBasedSequenceFileIndex.getIndexNameFromBaseName(mergeResultsPath)),
                false);

        try {
            SequenceFileMerger<Text, SubDomainMetadata> merger = new SequenceFileMerger<Text, SubDomainMetadata>(
                    remoteFileSystem, conf, parts, mergedFileSpillWriter, Text.class, SubDomainMetadata.class,
                    new RawKeyValueComparator<Text, SubDomainMetadata>() {

                        DataInputBuffer key1Stream = new DataInputBuffer();
                        DataInputBuffer key2Stream = new DataInputBuffer();

                        @Override
                        public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                                int key2Offset, int key2Length, byte[] value1Data, int value1Offset,
                                int value1Length, byte[] value2Data, int value2Offset, int value2Length)
                                throws IOException {
                            key1Stream.reset(key1Data, key1Offset, key1Length);
                            key2Stream.reset(key2Data, key2Offset, key2Length);

                            WritableUtils.readVInt(key1Stream);
                            WritableUtils.readVInt(key2Stream);

                            return BytesWritable.Comparator.compareBytes(key1Data, key1Stream.getPosition(),
                                    key1Length - key1Stream.getPosition(), key2Data, key2Stream.getPosition(),
                                    key2Length - key2Stream.getPosition());
                        }

                        @Override
                        public int compare(Text key1, SubDomainMetadata value1, Text key2,
                                SubDomainMetadata value2) {
                            return key1.compareTo(key2);
                        }
                    });

            try {
                LOG.info("Execute Local for Query:" + getQueryId() + " Running Merger");
                merger.mergeAndSpill(null);
                LOG.info("Execute Local for Query:" + getQueryId()
                        + " Merge Successful. Deleting Merge Inputs");
                for (Path inputPath : parts) {
                    remoteFileSystem.delete(inputPath, false);
                }
            } catch (IOException e) {
                LOG.error("Execute Local for Query:" + getQueryId() + " Merge Failed with Exception:"
                        + CCStringUtils.stringifyException(e));
                throw e;
            } finally {
                LOG.info("** CLOSING MERGER");
                merger.close();
            }
        } finally {
            LOG.info("** FLUSHING SPILLWRITER");
            mergedFileSpillWriter.close();
        }
    }

    // now check for query specific merge file ...
    Path queryResultsPath = new Path(getLocalQueryResultsPathPrefix(requestObject)
            + getOutputFileNameBasedOnSortByField(requestObject.getClientQueryInfo().getSortByField()));

    LOG.info("Execute Local for Query:" + getQueryId() + " Checking for QueryResultsPath:" + queryResultsPath);

    if (!localFileSystem.exists(queryResultsPath)) {
        LOG.info("Execute Local for Query:" + getQueryId() + " Results File:" + queryResultsPath
                + " does not exist. Running sort and merge process");

        LOG.info("Execute Local for Query:" + getQueryId() + " Allocating SpillWriter with output to:"
                + queryResultsPath);
        // allocate a spill writer ...
        SequenceFileSpillWriter<Text, SubDomainMetadata> sortedResultsFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
                localFileSystem, conf, queryResultsPath, Text.class, SubDomainMetadata.class,
                new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
                        PositionBasedSequenceFileIndex.getIndexNameFromBaseName(queryResultsPath)),
                false);

        try {
            LOG.info("Execute Local for Query:" + getQueryId() + " Allocating MergeSortSpillWriter");
            // and connect it to the merge spill writer ...
            MergeSortSpillWriter<Text, SubDomainMetadata> mergeSortSpillWriter = new MergeSortSpillWriter<Text, SubDomainMetadata>(
                    conf, sortedResultsFileSpillWriter, localFileSystem, new Path(tempFirDir.getAbsolutePath()),
                    /*
                    new RawKeyValueComparator<Text, SubDomainMetadata>() {

                      SubDomainMetadata value1 = new SubDomainMetadata();
                      SubDomainMetadata value2 = new SubDomainMetadata();

                      @Override
                      public int compare(Text key1, SubDomainMetadata value1, Text key2, SubDomainMetadata value2) {
                        return value1.getUrlCount() - value2.getUrlCount();
                      }

                      @Override
                      public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                          int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length,
                          byte[] value2Data, int value2Offset, int value2Length) throws IOException {
                        value1.clear();
                        value2.clear();
                        value1.readFields(new DataInputStream(new ByteArrayInputStream(value1Data, value1Offset, value1Length)));
                        value2.readFields(new DataInputStream(new ByteArrayInputStream(value2Data, value2Offset, value2Length)));
                        return compare(null, value1, null, value2);
                      }
                    },
                    */
                    new OptimizedKeyGeneratorAndComparator<Text, SubDomainMetadata>() {

                        @Override
                        public void generateOptimizedKeyForPair(Text key, SubDomainMetadata value,
                                org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator.OptimizedKey optimizedKeyOut)
                                throws IOException {
                            optimizedKeyOut.setLongKeyValue(value.getUrlCount());
                        }

                        @Override
                        public int getGeneratedKeyType() {
                            return OptimizedKey.KEY_TYPE_LONG;
                        }
                    }, Text.class, SubDomainMetadata.class, false, null);

            try {
                // create a vector representing the single input segment
                Vector<Path> singleInputSegment = new Vector<Path>();

                LOG.info("Execute Local for Query:" + getQueryId() + " Adding MergeResultsPath:"
                        + mergeResultsPath + " as input for Merger");
                singleInputSegment.add(mergeResultsPath);

                // create a SequenceFileReader
                SequenceFileReader<Text, SubDomainMetadata> mergeSegmentReader = new SequenceFileReader<Text, SubDomainMetadata>(
                        localFileSystem, conf, singleInputSegment, mergeSortSpillWriter, Text.class,
                        SubDomainMetadata.class);

                try {
                    LOG.info("Execute Local for Query:" + getQueryId() + " calling readAndSpill");
                    mergeSegmentReader.readAndSpill();
                    LOG.info("Execute Local for Query:" + getQueryId() + " readAndSpill finished");
                } finally {
                    if (mergeSegmentReader != null) {
                        mergeSegmentReader.close();
                    }
                }
            } finally {
                if (mergeSortSpillWriter != null) {
                    mergeSortSpillWriter.close();
                }
            }
        } finally {
            if (sortedResultsFileSpillWriter != null) {
                sortedResultsFileSpillWriter.close();
            }
        }
    }

    //LOG.info("Allocating SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath);
    PositionBasedSequenceFileIndex<Text, SubDomainMetadata> indexFile = new PositionBasedSequenceFileIndex<Text, SubDomainMetadata>(
            localFileSystem, queryResultsPath, Text.class, SubDomainMetadata.class);
    //LOG.info("SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath + " returned record count:" + indexFile.getRecordCount());

    return indexFile.getRecordCount();
}
From source file:org.commoncrawl.service.queryserver.query.DomainListQuery.java
License:Open Source License
@Override
public boolean requiresRemoteDispatch(FileSystem fileSystem, Configuration conf, ShardMapper shardMapper,
        QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> theClientRequest,
        ArrayList<ShardIndexHostNameTuple> shardIdToHostNameMapping) throws IOException {

    // get shard mappings for index ...
    shardIdToHostNameMapping.addAll(shardMapper
            .mapShardIdsForIndex(DatabaseIndexV2.MasterDatabaseIndex.INDEX_NAME_DOMAIN_NAME_TO_METADATA));

    // create a set representing the collection of parts required to complete this query ...
    Set<String> requiredParts = new HashSet<String>();
    for (ShardIndexHostNameTuple tuple : shardIdToHostNameMapping) {
        requiredParts.add(getPartNameForSlave(tuple.getShardId()));
    }

    // now iterate parts available on hdfs ...
    Path remoteQueryPath = getHDFSQueryResultsPath();
    //LOG.info("Results Path is:" + remoteQueryPath);

    FileStatus availableParts[] = fileSystem.globStatus(new Path(remoteQueryPath, "part-*"));
    for (FileStatus part : availableParts) {
        //LOG.info("Found Path:" + part.getPath());
        requiredParts.remove(part.getPath().getName());
    }

    // now check to see if all parts are available
    if (requiredParts.size() != 0) {
        for (String part : requiredParts) {
            LOG.info("Required remote part:" + part + " NOT available yet.");
        }
        return true;
    } else {
        LOG.info("All parts required for query available.");
        return false;
    }
}
From source file:org.commoncrawl.util.CrawlLogSplitter.java
License:Open Source License
public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileStatus arcFiles[] = fs.globStatus(new Path("crawl/checkpoint_data/CrawlLog_*"));
    for (FileStatus candidate : arcFiles) {
        if (candidate.getLen() > SPLIT_SIZE) {
            candidateList.add(candidate.getPath());
        }
    }
    LOG.info("Found:" + candidateList.size() + " oversized candidates");

    Path tempOutputDir = new Path(conf.get("mapred.temp.dir", "."));

    while (candidateList.size() != 0) {
        Path candidateName = candidateList.first();
        candidateList.remove(candidateName);
        LOG.info("Processing Candidate:" + candidateName);
        long fileSize = fs.getFileStatus(candidateName).getLen();
        // get crawl log filename components
        ArrayList<Path> splitItems = new ArrayList<Path>();
        int index = 0;
        Path outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(), index);
        LOG.info("Initial Output Path is:" + outputPart);
        fs.delete(outputPart, false);
        // create reader
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, candidateName, conf);
        ValueBytes sourceVB = reader.createValueBytes();
        DataOutputBuffer sourceKeyData = new DataOutputBuffer();
        try {
            // ok create temp file
            SequenceFile.Writer activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());
            // add to split items array
            splitItems.add(outputPart);
            try {
                long recordsWritten = 0;
                while (reader.nextRawKey(sourceKeyData) != -1) {
                    reader.nextRawValue(sourceVB);
                    long lengthPreWrite = activeWriter.getLength();
                    activeWriter.appendRaw(sourceKeyData.getData(), 0, sourceKeyData.getLength(), sourceVB);
                    if (++recordsWritten % 10000 == 0) {
                        LOG.info("Wrote 10000 records");
                    }
                    long lengthPostWrite = activeWriter.getLength();
                    if (lengthPostWrite != lengthPreWrite) {
                        if (lengthPostWrite >= IDEAL_SIZE) {
                            LOG.info("Hit Split Point. Flushing File:" + outputPart);
                            activeWriter.close();
                            outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir,
                                    candidateName.getName(), ++index);
                            LOG.info("Creating New File:" + outputPart);
                            activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());
                            splitItems.add(outputPart);
                        }
                    }
                    sourceKeyData.reset();
                }
            } finally {
                activeWriter.close();
            }
        } finally {
            reader.close();
        }
        LOG.info("Rewrote Source:" + candidateName + " into:" + splitItems.size() + " split files");
        for (Path splitItem : splitItems) {
            Path destPath = new Path("crawl/checkpoint_data", splitItem.getName());
            LOG.info("Moving:" + splitItem + " to:" + destPath);
            fs.rename(splitItem, destPath);
        }
        Path sourceMoveLocation = new Path("crawl/checkpoint_data_split", candidateName.getName());
        LOG.info("Moving SOURCE:" + candidateName + " to:" + sourceMoveLocation);
        fs.rename(candidateName, sourceMoveLocation);
    }
}
From source file:org.commoncrawl.util.HDFSUtils.java
License:Open Source License
public static long findLatestDatabaseTimestamp(FileSystem fs, Path rootPath) throws IOException {
    FileStatus candidates[] = fs.globStatus(new Path(rootPath, "*"));

    long candidateTimestamp = -1L;

    for (FileStatus candidate : candidates) {
        LOG.info("Found Seed Candidate:" + candidate.getPath());
        try {
            long timestamp = Long.parseLong(candidate.getPath().getName());
            if (candidateTimestamp == -1 || candidateTimestamp < timestamp) {
                candidateTimestamp = timestamp;
            }
        } catch (Exception e) {
            LOG.error("Invalid Path:" + candidate.getPath());
        }
    }
    LOG.info("Selected Candidate is:" + candidateTimestamp);
    return candidateTimestamp;
}
From source file:org.commoncrawl.util.NodeAffinityMaskBuilder.java
License:Open Source License
public static String buildNodeAffinityMask(FileSystem fileSystem, Path partFileDirectory,
        Map<Integer, String> optionalRootMapHint, Set<String> excludedNodeList, int maxReducersPerNode,
        boolean skipBalance) throws IOException {

    TreeMap<Integer, String> partitionToNodeMap = new TreeMap<Integer, String>();
    FileStatus paths[] = fileSystem.globStatus(new Path(partFileDirectory, "part-*"));

    if (paths.length == 0) {
        throw new IOException("Invalid source Path:" + partFileDirectory);
    }

    Multimap<String, Integer> inverseMap = TreeMultimap.create();
    Map<Integer, List<String>> partitionToDesiredCandidateList = new TreeMap<Integer, List<String>>();

    // iterate paths
    for (FileStatus path : paths) {

        String currentFile = path.getPath().getName();
        int partitionNumber;
        try {
            if (currentFile.startsWith("part-r")) {
                partitionNumber = NUMBER_FORMAT.parse(currentFile.substring("part-r-".length())).intValue();
            } else {
                partitionNumber = NUMBER_FORMAT.parse(currentFile.substring("part-".length())).intValue();
            }
        } catch (ParseException e) {
            throw new IOException("Invalid Part Name Encountered:" + currentFile);
        }

        // get block locations
        BlockLocation locations[] = fileSystem.getFileBlockLocations(path, 0, path.getLen());

        // if passed in root map is not null, then validate that all blocks for the current file reside on the desired node
        if (optionalRootMapHint != null) {
            // the host all blocks should reside on
            String desiredHost = optionalRootMapHint.get(partitionNumber);
            ArrayList<String> misplacedBlocks = new ArrayList<String>();
            // ok walk all blocks
            for (BlockLocation location : locations) {
                boolean found = false;
                for (String host : location.getHosts()) {
                    if (host.compareTo(desiredHost) == 0) {
                        found = true;
                        break;
                    }
                }
                if (!found) {
                    misplacedBlocks.add("Block At:" + location.getOffset() + " for File:" + path.getPath()
                            + " did not contain desired location:" + desiredHost);
                }
            }
            // ok pass test at a certain threshold
            if (misplacedBlocks.size() != 0
                    && ((float) misplacedBlocks.size() / (float) locations.length) > .50f) {
                LOG.error("Misplaced Blocks Exceed Threshold");
                for (String misplacedBlock : misplacedBlocks) {
                    LOG.error(misplacedBlock);
                }
                // TODO: SKIP THIS STEP FOR NOW ???
                //throw new IOException("Misplaced Blocks Exceed Threshold!");
            }
            partitionToNodeMap.put(partitionNumber, desiredHost);
        } else {
            if (excludedNodeList != null) {
                // LOG.info("Excluded Node List is:" + Lists.newArrayList(excludedNodeList).toString());
            }
            // ok ask file system for block locations
            TreeMap<String, Integer> nodeToBlockCount = new TreeMap<String, Integer>();

            for (BlockLocation location : locations) {
                for (String host : location.getHosts()) {
                    if (excludedNodeList == null || !excludedNodeList.contains(host)) {
                        Integer nodeHitCount = nodeToBlockCount.get(host);
                        if (nodeHitCount == null) {
                            nodeToBlockCount.put(host, 1);
                        } else {
                            nodeToBlockCount.put(host, nodeHitCount.intValue() + 1);
                        }
                    }
                }
            }

            if (nodeToBlockCount.size() == 0) {
                throw new IOException("No valid nodes found for partition number:" + path);
            }

            Map.Entry<String, Integer> entries[] = nodeToBlockCount.entrySet().toArray(new Map.Entry[0]);
            Arrays.sort(entries, new Comparator<Map.Entry<String, Integer>>() {

                @Override
                public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
                    return o1.getValue().intValue() < o2.getValue().intValue() ? 1
                            : o1.getValue().intValue() == o2.getValue().intValue() ? 0 : -1;
                }
            });

            // build a list of nodes by priority ...
            List<String> nodesByPriority = Lists.transform(Lists.newArrayList(entries),
                    new Function<Map.Entry<String, Integer>, String>() {

                        @Override
                        public String apply(Entry<String, Integer> entry) {
                            return entry.getKey();
                        }
                    });

            // stash it away ...
            partitionToDesiredCandidateList.put(partitionNumber, nodesByPriority);
            //LOG.info("Mapping Partition:" + partitionNumber + " To Node:" + entries[0].getKey() + " BlockCount" + entries[0].getValue().intValue());
            partitionToNodeMap.put(partitionNumber, entries[0].getKey());
            // store the inverse mapping ...
            inverseMap.put(entries[0].getKey(), partitionNumber);
        }
    }

    if (skipBalance) {
        // walk partition map to make sure everything is assigned ...
        /*
        for (String node : inverseMap.keys()) {
          if (inverseMap.get(node).size() > maxReducersPerNode) {
            throw new IOException("Node:" + node + " has too many partitions! (" + inverseMap.get(node).size());
          }
        }
        */
    }

    // now if optional root map hint is null
    if (optionalRootMapHint == null && !skipBalance) {
        // figure out if there is an imbalance
        int avgRegionsPerNode = (int) Math.floor((float) paths.length / (float) inverseMap.keySet().size());
        int maxRegionsPerNode = (int) Math.ceil((float) paths.length / (float) inverseMap.keySet().size());

        LOG.info("Attempting to ideally balance nodes. Avg partitions per node:" + avgRegionsPerNode);

        // two passes ..
        for (int pass = 0; pass < 2; ++pass) {
            LOG.info("Pass:" + pass);
            // iterate nodes ...
            for (String node : ImmutableSet.copyOf(inverseMap.keySet())) {
                // get partitions in map
                Collection<Integer> partitions = ImmutableList.copyOf(inverseMap.get(node));
                // if partition count exceeds desired average ...
                if (partitions.size() > maxRegionsPerNode) {
                    // first pass, assign based on preference
                    if (pass == 0) {
                        LOG.info("Node:" + node + " partition count:" + partitions.size() + " exceeds avg:"
                                + avgRegionsPerNode);
                        // walk partitions trying to find a node to discard the partition to
                        for (int partition : partitions) {
                            for (String candidate : partitionToDesiredCandidateList.get(partition)) {
                                if (!candidate.equals(node)) {
                                    // see if this candidate has room ..
                                    if (inverseMap.get(candidate).size() < avgRegionsPerNode) {
                                        LOG.info("REASSIGNING partition:" + partition + " from Node:" + node
                                                + " to Node:" + candidate);
                                        // found match reassign it ...
                                        inverseMap.remove(node, partition);
                                        inverseMap.put(candidate, partition);
                                        break;
                                    }
                                }
                            }
                            // break out if we reach our desired number of partitions for this node
                            if (inverseMap.get(node).size() == avgRegionsPerNode)
                                break;
                        }
                    }
                    // second pass ... assign based on least loaded node ...
                    else {
                        int desiredRelocations = partitions.size() - maxRegionsPerNode;
                        LOG.info("Desired Relocation for node:" + node + ":" + desiredRelocations
                                + " partitions:" + partitions.size());
                        for (int i = 0; i < desiredRelocations; ++i) {
                            String leastLoadedNode = null;
                            int leastLoadedNodePartitionCount = 0;

                            for (String candidateNode : inverseMap.keySet()) {
                                if (leastLoadedNode == null || inverseMap.get(candidateNode)
                                        .size() < leastLoadedNodePartitionCount) {
                                    leastLoadedNode = candidateNode;
                                    leastLoadedNodePartitionCount = inverseMap.get(candidateNode).size();
                                }
                            }

                            int bestPartition = -1;
                            int bestPartitionOffset = -1;

                            for (int candidatePartition : inverseMap.get(node)) {
                                int offset = 0;
                                for (String nodeCandidate : partitionToDesiredCandidateList
                                        .get(candidatePartition)) {
                                    if (nodeCandidate.equals(leastLoadedNode)) {
                                        if (bestPartition == -1 || bestPartitionOffset > offset) {
                                            bestPartition = candidatePartition;
                                            bestPartitionOffset = offset;
                                        }
                                        break;
                                    }
                                    offset++;
                                }
                            }

                            if (bestPartition == -1) {
                                bestPartition = Iterables.get(inverseMap.get(node), 0);
                            }

                            LOG.info("REASSIGNING partition:" + bestPartition + " from Node:" + node
                                    + " to Node:" + leastLoadedNode);
                            // found match reassign it ...
                            inverseMap.remove(node, bestPartition);
                            inverseMap.put(leastLoadedNode, bestPartition);
                        }
                    }
                }
            }
        }

        LOG.info("Rebuilding partition to node map based on ideal balance");
        for (String node : inverseMap.keySet()) {
            LOG.info("Node:" + node + " has:" + inverseMap.get(node).size() + " partitions:"
                    + inverseMap.get(node).toString());
        }

        partitionToNodeMap.clear();
        for (Map.Entry<String, Integer> entry : inverseMap.entries()) {
            partitionToNodeMap.put(entry.getValue(), entry.getKey());
        }
    }

    StringBuilder builder = new StringBuilder();
    int itemCount = 0;
    for (Map.Entry<Integer, String> entry : partitionToNodeMap.entrySet()) {
        if (itemCount++ != 0)
            builder.append("\t");
        builder.append(entry.getKey().intValue() + "," + entry.getValue());
    }

    return builder.toString();
}
From source file:org.exem.flamingo.shared.util.HdfsUtils.java
License:Apache License
/**
 * Moves one or more files matching the source pattern to the target path.
 *
 * @param source source path (may contain a glob pattern)
 * @param target target path
 * @param fs     Hadoop FileSystem
 */
public static void move(String source, String target, FileSystem fs) throws Exception {
    Path srcPath = new Path(source);
    Path[] srcs = FileUtil.stat2Paths(fs.globStatus(srcPath), srcPath);
    Path dst = new Path(target);
    if (srcs.length > 1 && !fs.getFileStatus(dst).isDirectory()) {
        throw new ServiceException("When moving multiple files, destination should be a directory.");
    }
    for (int i = 0; i < srcs.length; i++) {
        if (!fs.rename(srcs[i], dst)) {
            FileStatus srcFstatus = null;
            FileStatus dstFstatus = null;
            try {
                srcFstatus = fs.getFileStatus(srcs[i]);
            } catch (FileNotFoundException e) {
                throw new FileNotFoundException(srcs[i] + ": No such file or directory");
            }
            try {
                dstFstatus = fs.getFileStatus(dst);
            } catch (IOException e) {
                // Nothing
            }
            if ((srcFstatus != null) && (dstFstatus != null)) {
                if (srcFstatus.isDirectory() && !dstFstatus.isDirectory()) {
                    throw new ServiceException(
                            "cannot overwrite non directory " + dst + " with directory " + srcs[i]);
                }
            }
            throw new ServiceException("Failed to rename " + srcs[i] + " to " + dst);
        }
    }
}
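Worth noting in this last example is the FileUtil.stat2Paths(fs.globStatus(srcPath), srcPath) idiom: stat2Paths converts the matched FileStatus entries back into Path objects, and when globStatus returns null (no matching path component exists) it falls back to the literal srcPath, so the subsequent getFileStatus call can raise a proper FileNotFoundException rather than a NullPointerException.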