Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usage of org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Return all the files that match filePattern and are not checksum files.
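
Before the project-specific examples below, here is a minimal, self-contained sketch of a globStatus call. The pattern "/data/logs/*.gz", the class name, and the plain Configuration defaults are illustrative assumptions, not taken from the examples that follow.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        // Obtain the FileSystem configured as the default (HDFS on a cluster, local otherwise).
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Expand a glob pattern; the path below is a made-up example.
        FileStatus[] matches = fs.globStatus(new Path("/data/logs/*.gz"));

        // globStatus may return null (for example when the path does not exist),
        // so guard before iterating.
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}

The project examples below apply the same call to locate part files, timestamped database directories, and oversized crawl logs.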

Usage

From source file: org.commoncrawl.service.pagerank.master.PageRankMaster.java

License: Open Source License

private int findListValidIteration(FileSystem fs, Path jobDataPath) throws IOException {
    // scan job directory for best value candidate 
    Path valueSearchPattern = new Path(jobDataPath, "value_*-00000");

    FileStatus candidates[] = fs.globStatus(valueSearchPattern);

    int lastValidIterationNo = -1;

    ArrayList<Path> iterationSpecificValues = new ArrayList<Path>();

    for (FileStatus candidate : candidates) {
        // extract iteration portion of name 
        String iterationStr = candidate.getPath().getName().substring("value_".length(), "value_".length() + 5);
        // parse 
        try {
            int iterationId = NUMBER_FORMAT.parse(iterationStr).intValue();
            // now see if we have up to PR_NUMSLAVES values 
            Path iterationSpecificSearchPattern = new Path(jobDataPath, "value_" + iterationStr + "-*");
            // count result 
            FileStatus iterationSpecificEntires[] = fs.globStatus(iterationSpecificSearchPattern);

            if (iterationSpecificEntires.length == CrawlEnvironment.PR_NUMSLAVES) {
                LOG.info("Iteration Number:" + iterationId + " has the proper number of results");

                if (lastValidIterationNo == -1 || lastValidIterationNo < iterationId) {
                    // set this iteration as the valid iteration
                    lastValidIterationNo = iterationId;
                    LOG.info("Setting Iteration:" + iterationId + " as last valid iteration number");
                    // clear candidate list 
                    iterationSpecificValues.clear();
                    // add paths to candidate list ... 
                    for (FileStatus iterationSpecificEntry : iterationSpecificEntires) {
                        iterationSpecificValues.add(iterationSpecificEntry.getPath());
                    }
                }
            } else {
                LOG.error("Skipping Iteration Number:" + iterationId + ". It only has:"
                        + iterationSpecificEntires.length + " results");
            }

        } catch (ParseException e) {
            LOG.error(CCStringUtils.stringifyException(e));
        }
    }
    return lastValidIterationNo;
}

From source file: org.commoncrawl.service.pagerank.master.PageRankMaster.java

License: Open Source License

private void loadState() throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    // allocate a new server state ... 
    _serverState = new PRMasterState();
    // initially in idle state 
    _serverState.setServerStatus(PRMasterState.ServerStatus.IDLE);
    // paths
    Path valuesFilePath = new Path("crawl/pageRank/seed/" + _jobId + "/values");
    Path edgesFilePath = new Path("crawl/pageRank/seed/" + _jobId + "/edges");
    // figure out number of values
    int itemCount = fs.globStatus(new Path(valuesFilePath, "value_*")).length;
    LOG.info("There are:" + itemCount + " values for job:" + _jobId);
    Path jobsPath = new Path("crawl/pageRank/jobs/" + _jobId);
    fs.mkdirs(jobsPath);
    // find last valid iteration 
    int lastValidIteration = findListValidIteration(fs, jobsPath);
    LOG.info("Last Valid Iteration for Job:" + _jobId + " is:" + lastValidIteration);
    int nextIteration = lastValidIteration + 1;
    // clear all check point and distribution files 
    clearAllCheckpointAndDistributionFiles(fs, jobsPath);

    PageRankJobConfig jobConfig = new PageRankJobConfig();

    jobConfig.setJobId(_jobId);
    jobConfig.setIterationNumber(nextIteration);
    jobConfig.setMaxIterationNumber(1000);
    jobConfig.setSlaveCount(itemCount);
    jobConfig.setInputValuesPath(valuesFilePath.toString());
    jobConfig.setOutlinksDataPath(edgesFilePath.toString());
    // set up job dir ... 
    Path jobPath = new Path(_hdfsWorkingDir, Long.toString(jobConfig.getJobId()));
    jobConfig.setJobWorkPath(jobPath.toString());
    jobConfig.setAlgorithmId(0);
    jobConfig.setAlpha(.85f);

    _jobConfig = jobConfig;

    // update server state ...
    _serverState.setActiveJobConfig(_jobConfig);
    _serverState.setFieldDirty(PRMasterState.Field_ACTIVEJOBCONFIG);
    _serverState.setServerStatus(PRMasterState.ServerStatus.STARTED);

}

From source file: org.commoncrawl.service.queryserver.master.MasterServer.java

License: Open Source License

private Path locateQueryDBPath() throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    FileStatus statusArray[] = fs.globStatus(new Path("crawl/querydb/db/*"));

    Path candidatePath = null;
    for (FileStatus fileStatus : statusArray) {
        if (candidatePath == null) {
            candidatePath = fileStatus.getPath();
        } else {
            long prevTimestamp = Long.parseLong(candidatePath.getName());
            long currentTimestamp = Long.parseLong(fileStatus.getPath().getName());
            if (currentTimestamp > prevTimestamp) {
                candidatePath = fileStatus.getPath();
            }
        }
    }
    if (candidatePath != null) {
        LOG.info("Selected Candidate Path:" + candidatePath);
    }
    return candidatePath;
}

From source file: org.commoncrawl.service.queryserver.master.QueryServerFE.java

License: Open Source License

private static long findLatestDatabaseTimestamp(Path rootPath) throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    FileStatus candidates[] = fs.globStatus(new Path(rootPath, "*"));

    long candidateTimestamp = -1L;

    for (FileStatus candidate : candidates) {
        LOG.info("Found Seed Candidate:" + candidate.getPath());
        long timestamp = Long.parseLong(candidate.getPath().getName());
        if (candidateTimestamp == -1 || candidateTimestamp < timestamp) {
            candidateTimestamp = timestamp;
        }
    }
    LOG.info("Selected Candidate is:" + candidateTimestamp);
    return candidateTimestamp;
}

From source file: org.commoncrawl.service.queryserver.query.DomainListQuery.java

License: Open Source License

@Override
protected long executeLocal(FileSystem remoteFileSystem, Configuration conf,
        DatabaseIndexV2.MasterDatabaseIndex index, EventLoop eventLoop, File tempFirDir,
        QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> requestObject) throws IOException {

    Path mergeResultsPath = new Path(
            getLocalQueryResultsPathPrefix(requestObject) + getMergedResultsFileName());

    LOG.info("Execute Local called for Query:" + getQueryId() + " MergeResultsPath is:" + mergeResultsPath);

    // get a local file system object
    FileSystem localFileSystem = FileSystem.getLocal(conf);

    //LOG.info("Executing LocalQuery - checking if MergedFile:" + mergeResultsPath + " Exists");
    // if source merged results path does not exist ... 
    if (!localFileSystem.exists(mergeResultsPath)) {
        LOG.info("Execute Local for Query:" + getQueryId() + " Source MergeFile:" + mergeResultsPath
                + " Not Found. Checking for parts files");
        // collect parts ...
        Vector<Path> parts = new Vector<Path>();

        FileStatus fileStatusArray[] = remoteFileSystem
                .globStatus(new Path(getHDFSQueryResultsPath(), "part-*"));

        if (fileStatusArray.length == 0) {
            LOG.error("Execute Local for Query:" + getQueryId() + " FAILED. No Parts Files Found!");
            throw new IOException("Remote Component Part Files Not Found");
        }

        for (FileStatus part : fileStatusArray) {
            //LOG.info("Found Part:"+ part);
            parts.add(part.getPath());
        }

        LOG.info("Execute Local for Query:" + getQueryId() + " Initializing Merger");
        SequenceFileSpillWriter<Text, SubDomainMetadata> mergedFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
                localFileSystem, conf, mergeResultsPath, Text.class, SubDomainMetadata.class,
                new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
                        PositionBasedSequenceFileIndex.getIndexNameFromBaseName(mergeResultsPath)),
                false);

        try {
            SequenceFileMerger<Text, SubDomainMetadata> merger = new SequenceFileMerger<Text, SubDomainMetadata>(
                    remoteFileSystem, conf, parts, mergedFileSpillWriter, Text.class, SubDomainMetadata.class,

                    new RawKeyValueComparator<Text, SubDomainMetadata>() {

                        DataInputBuffer key1Stream = new DataInputBuffer();
                        DataInputBuffer key2Stream = new DataInputBuffer();

                        @Override
                        public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                                int key2Offset, int key2Length, byte[] value1Data, int value1Offset,
                                int value1Length, byte[] value2Data, int value2Offset, int value2Length)
                                throws IOException {

                            key1Stream.reset(key1Data, key1Offset, key1Length);
                            key2Stream.reset(key2Data, key2Offset, key2Length);

                            WritableUtils.readVInt(key1Stream);
                            WritableUtils.readVInt(key2Stream);

                            return BytesWritable.Comparator.compareBytes(key1Data, key1Stream.getPosition(),
                                    key1Length - key1Stream.getPosition(), key2Data, key2Stream.getPosition(),
                                    key2Length - key2Stream.getPosition());
                        }

                        @Override
                        public int compare(Text key1, SubDomainMetadata value1, Text key2,
                                SubDomainMetadata value2) {
                            return key1.compareTo(key2);
                        }

                    });

            try {
                LOG.info("Execute Local for Query:" + getQueryId() + " Running Merger");
                merger.mergeAndSpill(null);
                LOG.info("Execute Local for Query:" + getQueryId()
                        + " Merge Successfull.. Deleting Merge Inputs");
                for (Path inputPath : parts) {
                    remoteFileSystem.delete(inputPath, false);
                }
            } catch (IOException e) {
                LOG.error("Execute Local for Query:" + getQueryId() + " Merge Failed with Exception:"
                        + CCStringUtils.stringifyException(e));
                throw e;
            } finally {
                LOG.info("** CLOSING MERGER");
                merger.close();
            }
        } finally {
            LOG.info("** FLUSHING SPILLWRITER");
            mergedFileSpillWriter.close();
        }
    }

    // now check for query specific merge file ...
    Path queryResultsPath = new Path(getLocalQueryResultsPathPrefix(requestObject)
            + getOutputFileNameBasedOnSortByField(requestObject.getClientQueryInfo().getSortByField()));

    LOG.info("Execute Local for Query:" + getQueryId() + " Checking for QueryResultsPath:" + queryResultsPath);

    if (!localFileSystem.exists(queryResultsPath)) {

        LOG.info("Exectue Local for Query:" + getQueryId() + " Results File:" + queryResultsPath
                + " does not exist. Running sort and merge process");

        LOG.info("Execute Local for Query:" + getQueryId() + " Allocating SpillWriter with output to:"
                + queryResultsPath);
        // allocate a spill writer ...  
        SequenceFileSpillWriter<Text, SubDomainMetadata> sortedResultsFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
                localFileSystem, conf, queryResultsPath, Text.class, SubDomainMetadata.class,
                new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
                        PositionBasedSequenceFileIndex.getIndexNameFromBaseName(queryResultsPath)),
                false);

        try {

            LOG.info("Execute Local for Query:" + getQueryId() + " Allocating MergeSortSpillWriter");
            // and connect it to the merge spill writer ...
            MergeSortSpillWriter<Text, SubDomainMetadata> mergeSortSpillWriter = new MergeSortSpillWriter<Text, SubDomainMetadata>(
                    conf, sortedResultsFileSpillWriter, localFileSystem, new Path(tempFirDir.getAbsolutePath()),
                    /*
                    new RawKeyValueComparator<Text,SubDomainMetadata>() {
                            
                      SubDomainMetadata value1 = new SubDomainMetadata();
                      SubDomainMetadata value2 = new SubDomainMetadata();
                              
                            
                      @Override
                      public int compare(Text key1, SubDomainMetadata value1, Text key2,SubDomainMetadata value2) {
                        return value1.getUrlCount() - value2.getUrlCount();
                      }
                            
                      @Override
                      public int compareRaw(byte[] key1Data, int key1Offset,
                          int key1Length, byte[] key2Data, int key2Offset,
                          int key2Length, byte[] value1Data, int value1Offset,
                          int value1Length, byte[] value2Data, int value2Offset,
                          int value2Length) throws IOException {
                            
                        value1.clear();
                        value2.clear();
                                
                        value1.readFields(new DataInputStream(new ByteArrayInputStream(value1Data,value1Offset,value1Length)));
                        value2.readFields(new DataInputStream(new ByteArrayInputStream(value2Data,value2Offset,value2Length)));
                                
                        return compare(null, value1, null, value2);
                      } 
                              
                    },
                    */
                    new OptimizedKeyGeneratorAndComparator<Text, SubDomainMetadata>() {

                        @Override
                        public void generateOptimizedKeyForPair(Text key, SubDomainMetadata value,
                                org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator.OptimizedKey optimizedKeyOut)
                                throws IOException {
                            optimizedKeyOut.setLongKeyValue(value.getUrlCount());
                        }

                        @Override
                        public int getGeneratedKeyType() {
                            return OptimizedKey.KEY_TYPE_LONG;
                        }
                    }, Text.class, SubDomainMetadata.class, false, null);

            try {

                // create a vector representing the single input segment 
                Vector<Path> singleInputSegment = new Vector<Path>();

                LOG.info("Execute Local for Query:" + getQueryId() + " Adding MergeResultsPath:"
                        + mergeResultsPath + " as input for Merger");
                singleInputSegment.add(mergeResultsPath);

                // create a SequenceFileReader
                SequenceFileReader<Text, SubDomainMetadata> mergeSegmentReader = new SequenceFileReader<Text, SubDomainMetadata>(
                        localFileSystem, conf, singleInputSegment, mergeSortSpillWriter, Text.class,
                        SubDomainMetadata.class);

                try {
                    LOG.info("Execute Local for Query:" + getQueryId() + " calling readAndSpill");
                    mergeSegmentReader.readAndSpill();
                    LOG.info("Execute Local for Query:" + getQueryId() + " readAndSpill finished");
                } finally {
                    if (mergeSegmentReader != null) {
                        mergeSegmentReader.close();
                    }
                }

            } finally {
                if (mergeSortSpillWriter != null) {
                    mergeSortSpillWriter.close();
                }
            }

        } finally {
            if (sortedResultsFileSpillWriter != null) {
                sortedResultsFileSpillWriter.close();
            }
        }
    }

    //LOG.info("Allocating SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath);
    PositionBasedSequenceFileIndex<Text, SubDomainMetadata> indexFile = new PositionBasedSequenceFileIndex<Text, SubDomainMetadata>(
            localFileSystem, queryResultsPath, Text.class, SubDomainMetadata.class);
    //LOG.info("SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath + " returned record count:" + indexFile.getRecordCount());

    return indexFile.getRecordCount();
}

From source file: org.commoncrawl.service.queryserver.query.DomainListQuery.java

License: Open Source License

@Override
public boolean requiresRemoteDispatch(FileSystem fileSystem, Configuration conf, ShardMapper shardMapper,
        QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> theClientRequest,
        ArrayList<ShardIndexHostNameTuple> shardIdToHostNameMapping) throws IOException {

    // get shard mappings for index ... 
    shardIdToHostNameMapping.addAll(shardMapper
            .mapShardIdsForIndex(DatabaseIndexV2.MasterDatabaseIndex.INDEX_NAME_DOMAIN_NAME_TO_METADATA));

    // create a set representing the collection of parts required to complete this query ... 
    Set<String> requiredParts = new HashSet<String>();

    for (ShardIndexHostNameTuple tuple : shardIdToHostNameMapping) {
        requiredParts.add(getPartNameForSlave(tuple.getShardId()));
    }

    // now iterate parts available on hdfs ... 
    Path remoteQueryPath = getHDFSQueryResultsPath();
    //LOG.info("Results Path is:" + remoteQueryPath);

    FileStatus availableParts[] = fileSystem.globStatus(new Path(remoteQueryPath, "part-*"));

    for (FileStatus part : availableParts) {
        //LOG.info("Found Path:" + part.getPath());
        requiredParts.remove(part.getPath().getName());
    }

    // now check to see if all parts are available 
    if (requiredParts.size() != 0) {
        for (String part : requiredParts) {
            LOG.info("Required remote part:" + part + " NOT available yet.");
        }
        return true;
    } else {
        LOG.info("All parts required for query available.");
        return false;
    }
}

From source file: org.commoncrawl.util.CrawlLogSplitter.java

License: Open Source License

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileStatus arcFiles[] = fs.globStatus(new Path("crawl/checkpoint_data/CrawlLog_*"));
    for (FileStatus candidate : arcFiles) {
        if (candidate.getLen() > SPLIT_SIZE) {
            candidateList.add(candidate.getPath());
        }
    }

    LOG.info("Found:" + candidateList.size() + " oversized candidates");

    Path tempOutputDir = new Path(conf.get("mapred.temp.dir", "."));

    while (candidateList.size() != 0) {
        Path candidateName = candidateList.first();
        candidateList.remove(candidateName);

        LOG.info("Processing Candidate:" + candidateName);
        long fileSize = fs.getFileStatus(candidateName).getLen();
        //get crawl log filename components

        ArrayList<Path> splitItems = new ArrayList<Path>();

        int index = 0;

        Path outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(), index);

        LOG.info("Initial Output Path is:" + outputPart);

        fs.delete(outputPart, false);

        // create reader 
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, candidateName, conf);
        ValueBytes sourceVB = reader.createValueBytes();
        DataOutputBuffer sourceKeyData = new DataOutputBuffer();

        try {
            // ok create temp file 
            SequenceFile.Writer activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());

            // add to split items array 
            splitItems.add(outputPart);

            try {
                long recordsWritten = 0;
                while (reader.nextRawKey(sourceKeyData) != -1) {
                    reader.nextRawValue(sourceVB);
                    long lengthPreWrite = activeWriter.getLength();
                    activeWriter.appendRaw(sourceKeyData.getData(), 0, sourceKeyData.getLength(), sourceVB);
                    if (++recordsWritten % 10000 == 0) {
                        LOG.info("Write 10000 records");
                    }
                    long lengthPostWrite = activeWriter.getLength();
                    if (lengthPostWrite != lengthPreWrite) {
                        if (lengthPostWrite >= IDEAL_SIZE) {
                            LOG.info("Hit Split Point. Flushing File:" + outputPart);
                            activeWriter.close();
                            outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir,
                                    candidateName.getName(), ++index);
                            LOG.info("Creating New File:" + outputPart);
                            activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());
                            splitItems.add(outputPart);
                        }
                    }
                    sourceKeyData.reset();
                }
            } finally {
                activeWriter.close();
            }
        } finally {
            reader.close();
        }
        LOG.info("Rewrote Source:" + candidateName + " into:" + splitItems.size() + " split files");
        for (Path splitItem : splitItems) {
            Path destPath = new Path("crawl/checkpoint_data", splitItem.getName());
            LOG.info("Moving:" + splitItem + " to:" + destPath);
            fs.rename(splitItem, destPath);
        }
        Path sourceMoveLocation = new Path("crawl/checkpoint_data_split", candidateName.getName());
        LOG.info("Moving SOURCE:" + candidateName + " to:" + sourceMoveLocation);
        fs.rename(candidateName, sourceMoveLocation);
    }
}

From source file: org.commoncrawl.util.HDFSUtils.java

License: Open Source License

public static long findLatestDatabaseTimestamp(FileSystem fs, Path rootPath) throws IOException {

    FileStatus candidates[] = fs.globStatus(new Path(rootPath, "*"));

    long candidateTimestamp = -1L;

    for (FileStatus candidate : candidates) {
        LOG.info("Found Seed Candidate:" + candidate.getPath());
        try {
            long timestamp = Long.parseLong(candidate.getPath().getName());
            if (candidateTimestamp == -1 || candidateTimestamp < timestamp) {
                candidateTimestamp = timestamp;

            }
        } catch (Exception e) {
            LOG.error("Invalid Path:" + candidate.getPath());
        }
    }
    LOG.info("Selected Candidate is:" + candidateTimestamp);
    return candidateTimestamp;
}

From source file: org.commoncrawl.util.NodeAffinityMaskBuilder.java

License: Open Source License

public static String buildNodeAffinityMask(FileSystem fileSystem, Path partFileDirectory,
        Map<Integer, String> optionalRootMapHint, Set<String> excludedNodeList, int maxReducersPerNode,
        boolean skipBalance) throws IOException {

    TreeMap<Integer, String> partitionToNodeMap = new TreeMap<Integer, String>();
    FileStatus paths[] = fileSystem.globStatus(new Path(partFileDirectory, "part-*"));

    if (paths.length == 0) {
        throw new IOException("Invalid source Path:" + partFileDirectory);
    }

    Multimap<String, Integer> inverseMap = TreeMultimap.create();
    Map<Integer, List<String>> paritionToDesiredCandidateList = new TreeMap<Integer, List<String>>();

    // iterate paths 
    for (FileStatus path : paths) {

        String currentFile = path.getPath().getName();
        int partitionNumber;
        try {
            if (currentFile.startsWith("part-r")) {
                partitionNumber = NUMBER_FORMAT.parse(currentFile.substring("part-r-".length())).intValue();
            } else {
                partitionNumber = NUMBER_FORMAT.parse(currentFile.substring("part-".length())).intValue();
            }
        } catch (ParseException e) {
            throw new IOException("Invalid Part Name Encountered:" + currentFile);
        }

        // get block locations 
        BlockLocation locations[] = fileSystem.getFileBlockLocations(path, 0, path.getLen());

        // if passed in root map is not null, then validate that all blocks for the current file reside on the desired node 
        if (optionalRootMapHint != null) {
            // the host all blocks should reside on 
            String desiredHost = optionalRootMapHint.get(partitionNumber);

            ArrayList<String> misplacedBlocks = new ArrayList<String>();
            // ok walk all blocks 
            for (BlockLocation location : locations) {
                boolean found = false;
                for (String host : location.getHosts()) {
                    if (host.compareTo(desiredHost) == 0) {
                        found = true;
                        break;
                    }
                }
                if (!found) {
                    misplacedBlocks.add("Block At:" + location.getOffset() + " for File:" + path.getPath()
                            + " did not contain desired location:" + desiredHost);
                }

            }
            // ok pass test at a certain threshold 
            if (misplacedBlocks.size() != 0
                    && ((float) misplacedBlocks.size() / (float) locations.length) > .50f) {
                LOG.error("Misplaced Blocks Exceed Threshold");
                for (String misplacedBlock : misplacedBlocks) {
                    LOG.error(misplacedBlock);
                }
                // TODO: SKIP THIS STEP FOR NOW ??? 
                //throw new IOException("Misplaced Blocks Exceed Threshold!");
            }
            partitionToNodeMap.put(partitionNumber, desiredHost);
        } else {
            if (excludedNodeList != null) {
                // LOG.info("Exclued Node List is:" + Lists.newArrayList(excludedNodeList).toString());
            }
            // ok ask file system for block locations
            TreeMap<String, Integer> nodeToBlockCount = new TreeMap<String, Integer>();

            for (BlockLocation location : locations) {
                for (String host : location.getHosts()) {
                    if (excludedNodeList == null || !excludedNodeList.contains(host)) {
                        Integer nodeHitCount = nodeToBlockCount.get(host);
                        if (nodeHitCount == null) {
                            nodeToBlockCount.put(host, 1);
                        } else {
                            nodeToBlockCount.put(host, nodeHitCount.intValue() + 1);
                        }
                    }
                }
            }

            if (nodeToBlockCount.size() == 0) {
                throw new IOException("No valid nodes found for partition number:" + path);
            }

            Map.Entry<String, Integer> entries[] = nodeToBlockCount.entrySet().toArray(new Map.Entry[0]);
            Arrays.sort(entries, new Comparator<Map.Entry<String, Integer>>() {

                @Override
                public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
                    return o1.getValue().intValue() < o2.getValue().intValue() ? 1
                            : o1.getValue().intValue() == o2.getValue().intValue() ? 0 : -1;
                }
            });

            // build a list of nodes by priority ... 
            List<String> nodesByPriority = Lists.transform(Lists.newArrayList(entries),
                    new Function<Map.Entry<String, Integer>, String>() {

                        @Override
                        public String apply(Entry<String, Integer> entry) {
                            return entry.getKey();
                        }
                    });

            // stash it away ... 
            paritionToDesiredCandidateList.put(partitionNumber, nodesByPriority);
            //LOG.info("Mapping Partition:" + partitionNumber + " To Node:" + entries[0].getKey() + " BlockCount" + entries[0].getValue().intValue());
            partitionToNodeMap.put(partitionNumber, entries[0].getKey());
            // store the inverse mapping ... 
            inverseMap.put(entries[0].getKey(), partitionNumber);
        }
    }

    if (skipBalance) {
        // walk partition map to make sure everything is assigned ...
        /*
        for (String node : inverseMap.keys()) { 
          if (inverseMap.get(node).size() > maxReducersPerNode) { 
            throw new IOException("Node:" + node + " has too many partitions! ("+inverseMap.get(node).size());
          }
        }
        */
    }

    // now if optional root map hint is null 
    if (optionalRootMapHint == null && !skipBalance) {
        // figure out if there is an imbalance
        int avgRegionsPerNode = (int) Math.floor((float) paths.length / (float) inverseMap.keySet().size());
        int maxRegionsPerNode = (int) Math.ceil((float) paths.length / (float) inverseMap.keySet().size());
        LOG.info("Attempting to ideally balance nodes. Avg paritions per node:" + avgRegionsPerNode);

        // two passes .. 
        for (int pass = 0; pass < 2; ++pass) {
            LOG.info("Pass:" + pass);
            // iterate nodes ... 
            for (String node : ImmutableSet.copyOf(inverseMap.keySet())) {
                // get partitions in the map
                Collection<Integer> paritions = ImmutableList.copyOf(inverseMap.get(node));
                // if partition count exceeds the desired average ... 
                if (paritions.size() > maxRegionsPerNode) {
                    // first pass, assign based on preference 
                    if (pass == 0) {
                        LOG.info("Node:" + node + " parition count:" + paritions.size() + " exceeds avg:"
                                + avgRegionsPerNode);
                        // walk partitions trying to find a node to discard the partition to 
                        for (int partition : paritions) {
                            for (String candidate : paritionToDesiredCandidateList.get(partition)) {
                                if (!candidate.equals(node)) {
                                    // see if this candidate has room ..
                                    if (inverseMap.get(candidate).size() < avgRegionsPerNode) {
                                        LOG.info("REASSIGNING parition:" + partition + " from Node:" + node
                                                + " to Node:" + candidate);
                                        // found match reassign it ... 
                                        inverseMap.remove(node, partition);
                                        inverseMap.put(candidate, partition);
                                        break;
                                    }
                                }
                            }
                            // break out if we reach our desired number of partitions for this node 
                            if (inverseMap.get(node).size() == avgRegionsPerNode)
                                break;
                        }
                    }
                    // second pass ... assign based on least loaded node ... 
                    else {
                        int desiredRelocations = paritions.size() - maxRegionsPerNode;
                        LOG.info("Desired Relocation for node:" + node + ":" + desiredRelocations
                                + " partitions:" + paritions.size());
                        for (int i = 0; i < desiredRelocations; ++i) {
                            String leastLoadedNode = null;
                            int leastLoadedNodePartitionCount = 0;

                            for (String candidateNode : inverseMap.keySet()) {
                                if (leastLoadedNode == null || inverseMap.get(candidateNode)
                                        .size() < leastLoadedNodePartitionCount) {
                                    leastLoadedNode = candidateNode;
                                    leastLoadedNodePartitionCount = inverseMap.get(candidateNode).size();
                                }
                            }
                            int bestPartition = -1;
                            int bestParitionOffset = -1;

                            for (int candidateParition : inverseMap.get(node)) {
                                int offset = 0;
                                for (String nodeCandidate : paritionToDesiredCandidateList
                                        .get(candidateParition)) {
                                    if (nodeCandidate.equals(leastLoadedNode)) {
                                        if (bestPartition == -1 || bestParitionOffset > offset) {
                                            bestPartition = candidateParition;
                                            bestParitionOffset = offset;
                                        }
                                        break;
                                    }
                                    offset++;
                                }
                            }
                            if (bestPartition == -1) {
                                bestPartition = Iterables.get(inverseMap.get(node), 0);
                            }
                            LOG.info("REASSIGNING parition:" + bestPartition + " from Node:" + node
                                    + " to Node:" + leastLoadedNode);
                            // found match reassign it ... 
                            inverseMap.remove(node, bestPartition);
                            inverseMap.put(leastLoadedNode, bestPartition);
                        }
                    }
                }
            }
        }
        LOG.info("Rebuilding parition to node map based on ideal balance");
        for (String node : inverseMap.keySet()) {
            LOG.info("Node:" + node + " has:" + inverseMap.get(node).size() + " partitions:"
                    + inverseMap.get(node).toString());
        }

        partitionToNodeMap.clear();
        for (Map.Entry<String, Integer> entry : inverseMap.entries()) {
            partitionToNodeMap.put(entry.getValue(), entry.getKey());
        }
    }

    StringBuilder builder = new StringBuilder();
    int itemCount = 0;
    for (Map.Entry<Integer, String> entry : partitionToNodeMap.entrySet()) {
        if (itemCount++ != 0)
            builder.append("\t");
        builder.append(entry.getKey().intValue() + "," + entry.getValue());
    }

    return builder.toString();
}

From source file: org.exem.flamingo.shared.util.HdfsUtils.java

License: Apache License

/**
 * Moves the file(s) matching the source path to the target path.
 *
 * @param source source path (may contain a glob pattern)
 * @param target target path
 * @param fs     Hadoop FileSystem
 */
public static void move(String source, String target, FileSystem fs) throws Exception {
    Path srcPath = new Path(source);
    Path[] srcs = FileUtil.stat2Paths(fs.globStatus(srcPath), srcPath);
    Path dst = new Path(target);
    if (srcs.length > 1 && !fs.getFileStatus(dst).isDirectory()) {
        throw new ServiceException("When moving multiple files, destination should be a directory.");
    }
    for (int i = 0; i < srcs.length; i++) {
        if (!fs.rename(srcs[i], dst)) {
            FileStatus srcFstatus = null;
            FileStatus dstFstatus = null;
            try {
                srcFstatus = fs.getFileStatus(srcs[i]);
            } catch (FileNotFoundException e) {
                throw new FileNotFoundException(srcs[i] + ": No such file or directory");
            }
            try {
                dstFstatus = fs.getFileStatus(dst);
            } catch (IOException e) {
                // Nothing
            }
            if ((srcFstatus != null) && (dstFstatus != null)) {
                if (srcFstatus.isDirectory() && !dstFstatus.isDirectory()) {
                    throw new ServiceException(
                            "cannot overwrite non directory " + dst + " with directory " + srcs[i]);
                }
            }
            throw new ServiceException("Failed to rename " + srcs[i] + " to " + dst);
        }
    }
}
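
A hedged usage sketch of this helper follows; the caller class, both paths, and the default Configuration are illustrative assumptions. Inside move(), the part-* glob in the source argument is expanded by fs.globStatus().

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import org.exem.flamingo.shared.util.HdfsUtils;

public class HdfsMoveExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Both paths below are made up for illustration; the target directory is
        // assumed to already exist, since move() renames each matched file into it.
        HdfsUtils.move("/user/flamingo/output/part-*", "/user/flamingo/archive", fs);
    }
}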