Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usage of org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Return all the files that match filePattern and are not checksum files.
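
Before the project-specific examples below, here is a minimal, self-contained sketch of a globStatus call. The pattern "/data/logs/*.gz", the class name, and the plain Configuration defaults are illustrative assumptions, not taken from the examples that follow.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        // Obtain the FileSystem configured as the default (HDFS on a cluster, local otherwise).
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Expand a glob pattern; the path below is a made-up example.
        FileStatus[] matches = fs.globStatus(new Path("/data/logs/*.gz"));

        // globStatus may return null (for example when the path does not exist),
        // so guard before iterating.
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}

The project examples below apply the same call to locate part files, timestamped database directories, and oversized crawl logs.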

Usage

From source file: org.commoncrawl.service.pagerank.master.PageRankMaster.java

License: Open Source License

private int findListValidIteration(FileSystem fs, Path jobDataPath) throws IOException {
    // scan job directory for best value candidate 
    Path valueSearchPattern = new Path(jobDataPath, "value_*-00000");

    FileStatus candidates[] = fs.globStatus(valueSearchPattern);

    int lastValidIterationNo = -1;

    ArrayList<Path> iterationSpecificValues = new ArrayList<Path>();

    for (FileStatus candidate : candidates) {
        // extract iteration portion of name 
        String iterationStr = candidate.getPath().getName().substring("value_".length(), "value_".length() + 5);
        // parse 
        try {
            int iterationId = NUMBER_FORMAT.parse(iterationStr).intValue();
            // now see if we have up to PR_NUMSLAVES values 
            Path iterationSpecificSearchPattern = new Path(jobDataPath, "value_" + iterationStr + "-*");
            // count result 
            FileStatus iterationSpecificEntires[] = fs.globStatus(iterationSpecificSearchPattern);

            if (iterationSpecificEntires.length == CrawlEnvironment.PR_NUMSLAVES) {
                LOG.info("Iteration Number:" + iterationId + " has the proper number of results");

                if (lastValidIterationNo == -1 || lastValidIterationNo < iterationId) {
                    // set this iteration as the valid iteration
                    lastValidIterationNo = iterationId;
                    LOG.info("Setting Iteration:" + iterationId + " as last valid iteration number");
                    // clear candidate list 
                    iterationSpecificValues.clear();
                    // add paths to candidate list ... 
                    for (FileStatus iterationSpecificEntry : iterationSpecificEntires) {
                        iterationSpecificValues.add(iterationSpecificEntry.getPath());
                    }
                }
            } else {
                LOG.error("Skipping Iteration Number:" + iterationId + ". It only has:"
                        + iterationSpecificEntires.length + " results");
            }

        } catch (ParseException e) {
            LOG.error(CCStringUtils.stringifyException(e));
        }
    }
    return lastValidIterationNo;
}

From source file: org.commoncrawl.service.pagerank.master.PageRankMaster.java

License: Open Source License

private void loadState() throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    // allocate a new server state ... 
    _serverState = new PRMasterState();
    // initially in idle state 
    _serverState.setServerStatus(PRMasterState.ServerStatus.IDLE);
    // paths
    Path valuesFilePath = new Path("crawl/pageRank/seed/" + _jobId + "/values");
    Path edgesFilePath = new Path("crawl/pageRank/seed/" + _jobId + "/edges");
    // figure out number of values
    int itemCount = fs.globStatus(new Path(valuesFilePath, "value_*")).length;
    LOG.info("There are:" + itemCount + " values for job:" + _jobId);
    Path jobsPath = new Path("crawl/pageRank/jobs/" + _jobId);
    fs.mkdirs(jobsPath);
    // find last valid iteration 
    int lastValidIteration = findListValidIteration(fs, jobsPath);
    LOG.info("Last Valid Iteration for Job:" + _jobId + " is:" + lastValidIteration);
    int nextIteration = lastValidIteration + 1;
    // clear all check point and distribution files 
    clearAllCheckpointAndDistributionFiles(fs, jobsPath);

    PageRankJobConfig jobConfig = new PageRankJobConfig();

    jobConfig.setJobId(_jobId);
    jobConfig.setIterationNumber(nextIteration);
    jobConfig.setMaxIterationNumber(1000);
    jobConfig.setSlaveCount(itemCount);
    jobConfig.setInputValuesPath(valuesFilePath.toString());
    jobConfig.setOutlinksDataPath(edgesFilePath.toString());
    // set up job dir ... 
    Path jobPath = new Path(_hdfsWorkingDir, Long.toString(jobConfig.getJobId()));
    jobConfig.setJobWorkPath(jobPath.toString());
    jobConfig.setAlgorithmId(0);
    jobConfig.setAlpha(.85f);

    _jobConfig = jobConfig;

    // update server state ...
    _serverState.setActiveJobConfig(_jobConfig);
    _serverState.setFieldDirty(PRMasterState.Field_ACTIVEJOBCONFIG);
    _serverState.setServerStatus(PRMasterState.ServerStatus.STARTED);

}

From source file: org.commoncrawl.service.queryserver.master.MasterServer.java

License: Open Source License

private Path locateQueryDBPath() throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    FileStatus statusArray[] = fs.globStatus(new Path("crawl/querydb/db/*"));

    Path candidatePath = null;
    for (FileStatus fileStatus : statusArray) {
        if (candidatePath == null) {
            candidatePath = fileStatus.getPath();
        } else {
            long prevTimestamp = Long.parseLong(candidatePath.getName());
            long currentTimestamp = Long.parseLong(fileStatus.getPath().getName());
            if (currentTimestamp > prevTimestamp) {
                candidatePath = fileStatus.getPath();
            }
        }
    }
    if (candidatePath != null) {
        LOG.info("Selected Candidate Path:" + candidatePath);
    }
    return candidatePath;
}

From source file: org.commoncrawl.service.queryserver.master.QueryServerFE.java

License: Open Source License

private static long findLatestDatabaseTimestamp(Path rootPath) throws IOException {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    FileStatus candidates[] = fs.globStatus(new Path(rootPath, "*"));

    long candidateTimestamp = -1L;

    for (FileStatus candidate : candidates) {
        LOG.info("Found Seed Candidate:" + candidate.getPath());
        long timestamp = Long.parseLong(candidate.getPath().getName());
        if (candidateTimestamp == -1 || candidateTimestamp < timestamp) {
            candidateTimestamp = timestamp;
        }
    }
    LOG.info("Selected Candidate is:" + candidateTimestamp);
    return candidateTimestamp;
}

From source file: org.commoncrawl.service.queryserver.query.DomainListQuery.java

License: Open Source License

@Override
protected long executeLocal(FileSystem remoteFileSystem, Configuration conf,
        DatabaseIndexV2.MasterDatabaseIndex index, EventLoop eventLoop, File tempFirDir,
        QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> requestObject) throws IOException {

    Path mergeResultsPath = new Path(
            getLocalQueryResultsPathPrefix(requestObject) + getMergedResultsFileName());

    LOG.info("Execute Local called for Query:" + getQueryId() + " MergeResultsPath is:" + mergeResultsPath);

    // get a local file system object
    FileSystem localFileSystem = FileSystem.getLocal(conf);

    //LOG.info("Executing LocalQuery - checking if MergedFile:" + mergeResultsPath + " Exists");
    // if source merged results path does not exist ... 
    if (!localFileSystem.exists(mergeResultsPath)) {
        LOG.info("Execute Local for Query:" + getQueryId() + " Source MergeFile:" + mergeResultsPath
                + " Not Found. Checking for parts files");
        // collect parts ...
        Vector<Path> parts = new Vector<Path>();

        FileStatus fileStatusArray[] = remoteFileSystem
                .globStatus(new Path(getHDFSQueryResultsPath(), "part-*"));

        if (fileStatusArray.length == 0) {
            LOG.error("Execute Local for Query:" + getQueryId() + " FAILED. No Parts Files Found!");
            throw new IOException("Remote Component Part Files Not Found");
        }

        for (FileStatus part : fileStatusArray) {
            //LOG.info("Found Part:"+ part);
            parts.add(part.getPath());
        }

        LOG.info("Execute Local for Query:" + getQueryId() + " Initializing Merger");
        SequenceFileSpillWriter<Text, SubDomainMetadata> mergedFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
                localFileSystem, conf, mergeResultsPath, Text.class, SubDomainMetadata.class,
                new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
                        PositionBasedSequenceFileIndex.getIndexNameFromBaseName(mergeResultsPath)),
                false);

        try {
            SequenceFileMerger<Text, SubDomainMetadata> merger = new SequenceFileMerger<Text, SubDomainMetadata>(
                    remoteFileSystem, conf, parts, mergedFileSpillWriter, Text.class, SubDomainMetadata.class,

                    new RawKeyValueComparator<Text, SubDomainMetadata>() {

                        DataInputBuffer key1Stream = new DataInputBuffer();
                        DataInputBuffer key2Stream = new DataInputBuffer();

                        @Override
                        public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                                int key2Offset, int key2Length, byte[] value1Data, int value1Offset,
                                int value1Length, byte[] value2Data, int value2Offset, int value2Length)
                                throws IOException {

                            key1Stream.reset(key1Data, key1Offset, key1Length);
                            key2Stream.reset(key2Data, key2Offset, key2Length);

                            WritableUtils.readVInt(key1Stream);
                            WritableUtils.readVInt(key2Stream);

                            return BytesWritable.Comparator.compareBytes(key1Data, key1Stream.getPosition(),
                                    key1Length - key1Stream.getPosition(), key2Data, key2Stream.getPosition(),
                                    key2Length - key2Stream.getPosition());
                        }

                        @Override
                        public int compare(Text key1, SubDomainMetadata value1, Text key2,
                                SubDomainMetadata value2) {
                            return key1.compareTo(key2);
                        }

                    });

            try {
                LOG.info("Execute Local for Query:" + getQueryId() + " Running Merger");
                merger.mergeAndSpill(null);
                LOG.info("Execute Local for Query:" + getQueryId()
                        + " Merge Successfull.. Deleting Merge Inputs");
                for (Path inputPath : parts) {
                    remoteFileSystem.delete(inputPath, false);
                }
            } catch (IOException e) {
                LOG.error("Execute Local for Query:" + getQueryId() + " Merge Failed with Exception:"
                        + CCStringUtils.stringifyException(e));
                throw e;
            } finally {
                LOG.info("** CLOSING MERGER");
                merger.close();
            }
        } finally {
            LOG.info("** FLUSHING SPILLWRITER");
            mergedFileSpillWriter.close();
        }
    }

    // now check for query specific merge file ...
    Path queryResultsPath = new Path(getLocalQueryResultsPathPrefix(requestObject)
            + getOutputFileNameBasedOnSortByField(requestObject.getClientQueryInfo().getSortByField()));

    LOG.info("Execute Local for Query:" + getQueryId() + " Checking for QueryResultsPath:" + queryResultsPath);

    if (!localFileSystem.exists(queryResultsPath)) {

        LOG.info("Exectue Local for Query:" + getQueryId() + " Results File:" + queryResultsPath
                + " does not exist. Running sort and merge process");

        LOG.info("Execute Local for Query:" + getQueryId() + " Allocating SpillWriter with output to:"
                + queryResultsPath);
        // allocate a spill writer ...  
        SequenceFileSpillWriter<Text, SubDomainMetadata> sortedResultsFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
                localFileSystem, conf, queryResultsPath, Text.class, SubDomainMetadata.class,
                new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
                        PositionBasedSequenceFileIndex.getIndexNameFromBaseName(queryResultsPath)),
                false);

        try {

            LOG.info("Execute Local for Query:" + getQueryId() + " Allocating MergeSortSpillWriter");
            // and connect it to the merge spill writer ...
            MergeSortSpillWriter<Text, SubDomainMetadata> mergeSortSpillWriter = new MergeSortSpillWriter<Text, SubDomainMetadata>(
                    conf, sortedResultsFileSpillWriter, localFileSystem, new Path(tempFirDir.getAbsolutePath()),
                    /*
                    new RawKeyValueComparator<Text,SubDomainMetadata>() {
                            
                      SubDomainMetadata value1 = new SubDomainMetadata();
                      SubDomainMetadata value2 = new SubDomainMetadata();
                              
                            
                      @Override
                      public int compare(Text key1, SubDomainMetadata value1, Text key2,SubDomainMetadata value2) {
                        return value1.getUrlCount() - value2.getUrlCount();
                      }
                            
                      @Override
                      public int compareRaw(byte[] key1Data, int key1Offset,
                          int key1Length, byte[] key2Data, int key2Offset,
                          int key2Length, byte[] value1Data, int value1Offset,
                          int value1Length, byte[] value2Data, int value2Offset,
                          int value2Length) throws IOException {
                            
                        value1.clear();
                        value2.clear();
                                
                        value1.readFields(new DataInputStream(new ByteArrayInputStream(value1Data,value1Offset,value1Length)));
                        value2.readFields(new DataInputStream(new ByteArrayInputStream(value2Data,value2Offset,value2Length)));
                                
                        return compare(null, value1, null, value2);
                      } 
                              
                    },
                    */
                    new OptimizedKeyGeneratorAndComparator<Text, SubDomainMetadata>() {

                        @Override
                        public void generateOptimizedKeyForPair(Text key, SubDomainMetadata value,
                                org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator.OptimizedKey optimizedKeyOut)
                                throws IOException {
                            optimizedKeyOut.setLongKeyValue(value.getUrlCount());
                        }

                        @Override
                        public int getGeneratedKeyType() {
                            return OptimizedKey.KEY_TYPE_LONG;
                        }
                    }, Text.class, SubDomainMetadata.class, false, null);

            try {

                // create a vector representing the single input segment 
                Vector<Path> singleInputSegment = new Vector<Path>();

                LOG.info("Execute Local for Query:" + getQueryId() + " Adding MergeResultsPath:"
                        + mergeResultsPath + " as input for Merger");
                singleInputSegment.add(mergeResultsPath);

                // create a SequenceFileReader
                SequenceFileReader<Text, SubDomainMetadata> mergeSegmentReader = new SequenceFileReader<Text, SubDomainMetadata>(
                        localFileSystem, conf, singleInputSegment, mergeSortSpillWriter, Text.class,
                        SubDomainMetadata.class);

                try {
                    LOG.info("Execute Local for Query:" + getQueryId() + " calling readAndSpill");
                    mergeSegmentReader.readAndSpill();
                    LOG.info("Execute Local for Query:" + getQueryId() + " readAndSpill finished");
                } finally {
                    if (mergeSegmentReader != null) {
                        mergeSegmentReader.close();
                    }
                }

            } finally {
                if (mergeSortSpillWriter != null) {
                    mergeSortSpillWriter.close();
                }
            }

        } finally {
            if (sortedResultsFileSpillWriter != null) {
                sortedResultsFileSpillWriter.close();
            }
        }
    }

    //LOG.info("Allocating SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath);
    PositionBasedSequenceFileIndex<Text, SubDomainMetadata> indexFile = new PositionBasedSequenceFileIndex<Text, SubDomainMetadata>(
            localFileSystem, queryResultsPath, Text.class, SubDomainMetadata.class);
    //LOG.info("SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath + " returned record count:" + indexFile.getRecordCount());

    return indexFile.getRecordCount();
}

From source file: org.commoncrawl.service.queryserver.query.DomainListQuery.java

License: Open Source License

@Override
public boolean requiresRemoteDispatch(FileSystem fileSystem, Configuration conf, ShardMapper shardMapper,
        QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> theClientRequest,
        ArrayList<ShardIndexHostNameTuple> shardIdToHostNameMapping) throws IOException {

    // get shard mappings for index ... 
    shardIdToHostNameMapping.addAll(shardMapper
            .mapShardIdsForIndex(DatabaseIndexV2.MasterDatabaseIndex.INDEX_NAME_DOMAIN_NAME_TO_METADATA));

    // create a set representing the collection of parts required to complete this query ... 
    Set<String> requiredParts = new HashSet<String>();

    for (ShardIndexHostNameTuple tuple : shardIdToHostNameMapping) {
        requiredParts.add(getPartNameForSlave(tuple.getShardId()));
    }

    // now iterate parts available on hdfs ... 
    Path remoteQueryPath = getHDFSQueryResultsPath();
    //LOG.info("Results Path is:" + remoteQueryPath);

    FileStatus availableParts[] = fileSystem.globStatus(new Path(remoteQueryPath, "part-*"));

    for (FileStatus part : availableParts) {
        //LOG.info("Found Path:" + part.getPath());
        requiredParts.remove(part.getPath().getName());
    }

    // now check to see if all parts are available 
    if (requiredParts.size() != 0) {
        for (String part : requiredParts) {
            LOG.info("Required remote part:" + part + " NOT available yet.");
        }
        return true;
    } else {
        LOG.info("All parts required for query available.");
        return false;
    }
}

From source file: org.commoncrawl.util.CrawlLogSplitter.java

License: Open Source License

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileStatus arcFiles[] = fs.globStatus(new Path("crawl/checkpoint_data/CrawlLog_*"));
    for (FileStatus candidate : arcFiles) {
        if (candidate.getLen() > SPLIT_SIZE) {
            candidateList.add(candidate.getPath());
        }
    }

    LOG.info("Found:" + candidateList.size() + " oversized candidates");

    Path tempOutputDir = new Path(conf.get("mapred.temp.dir", "."));

    while (candidateList.size() != 0) {
        Path candidateName = candidateList.first();
        candidateList.remove(candidateName);

        LOG.info("Processing Candidate:" + candidateName);
        long fileSize = fs.getFileStatus(candidateName).getLen();
        //get crawl log filename components

        ArrayList<Path> splitItems = new ArrayList<Path>();

        int index = 0;

        Path outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(), index);

        LOG.info("Initial Output Path is:" + outputPart);

        fs.delete(outputPart, false);

        // create reader 
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, candidateName, conf);
        ValueBytes sourceVB = reader.createValueBytes();
        DataOutputBuffer sourceKeyData = new DataOutputBuffer();

        try {
            // ok create temp file 
            SequenceFile.Writer activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());

            // add to split items array 
            splitItems.add(outputPart);

            try {
                long recordsWritten = 0;
                while (reader.nextRawKey(sourceKeyData) != -1) {
                    reader.nextRawValue(sourceVB);
                    long lengthPreWrite = activeWriter.getLength();
                    activeWriter.appendRaw(sourceKeyData.getData(), 0, sourceKeyData.getLength(), sourceVB);
                    if (++recordsWritten % 10000 == 0) {
                        LOG.info("Write 10000 records");
                    }
                    long lengthPostWrite = activeWriter.getLength();
                    if (lengthPostWrite != lengthPreWrite) {
                        if (lengthPostWrite >= IDEAL_SIZE) {
                            LOG.info("Hit Split Point. Flushing File:" + outputPart);
                            activeWriter.close();
                            outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir,
                                    candidateName.getName(), ++index);
                            LOG.info("Creating New File:" + outputPart);
                            activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());
                            splitItems.add(outputPart);
                        }
                    }
                    sourceKeyData.reset();
                }
            } finally {
                activeWriter.close();
            }
        } finally {
            reader.close();
        }
        LOG.info("Rewrote Source:" + candidateName + " into:" + splitItems.size() + " split files");
        for (Path splitItem : splitItems) {
            Path destPath = new Path("crawl/checkpoint_data", splitItem.getName());
            LOG.info("Moving:" + splitItem + " to:" + destPath);
            fs.rename(splitItem, destPath);
        }
        Path sourceMoveLocation = new Path("crawl/checkpoint_data_split", candidateName.getName());
        LOG.info("Moving SOURCE:" + candidateName + " to:" + sourceMoveLocation);
        fs.rename(candidateName, sourceMoveLocation);
    }
}

From source file: org.commoncrawl.util.HDFSUtils.java

License: Open Source License

public static long findLatestDatabaseTimestamp(FileSystem fs, Path rootPath) throws IOException {

    FileStatus candidates[] = fs.globStatus(new Path(rootPath, "*"));

    long candidateTimestamp = -1L;

    for (FileStatus candidate : candidates) {
        LOG.info("Found Seed Candidate:" + candidate.getPath());
        try {
            long timestamp = Long.parseLong(candidate.getPath().getName());
            if (candidateTimestamp == -1 || candidateTimestamp < timestamp) {
                candidateTimestamp = timestamp;

            }
        } catch (Exception e) {
            LOG.error("Invalid Path:" + candidate.getPath());
        }
    }
    LOG.info("Selected Candidate is:" + candidateTimestamp);
    return candidateTimestamp;
}

From source file: org.commoncrawl.util.NodeAffinityMaskBuilder.java

License: Open Source License

public static String buildNodeAffinityMask(FileSystem fileSystem, Path partFileDirectory,
        Map<Integer, String> optionalRootMapHint, Set<String> excludedNodeList, int maxReducersPerNode,
        boolean skipBalance) throws IOException {

    TreeMap<Integer, String> partitionToNodeMap = new TreeMap<Integer, String>();
    FileStatus paths[] = fileSystem.globStatus(new Path(partFileDirectory, "part-*"));

    if (paths.length == 0) {
        throw new IOException("Invalid source Path:" + partFileDirectory);
    }

    Multimap<String, Integer> inverseMap = TreeMultimap.create();
    Map<Integer, List<String>> paritionToDesiredCandidateList = new TreeMap<Integer, List<String>>();

    // iterate paths 
    for (FileStatus path : paths) {

        String currentFile = path.getPath().getName();
        int partitionNumber;
        try {
            if (currentFile.startsWith("part-r")) {
                partitionNumber = NUMBER_FORMAT.parse(currentFile.substring("part-r-".length())).intValue();
            } else {
                partitionNumber = NUMBER_FORMAT.parse(currentFile.substring("part-".length())).intValue();
            }
        } catch (ParseException e) {
            throw new IOException("Invalid Part Name Encountered:" + currentFile);
        }

        // get block locations 
        BlockLocation locations[] = fileSystem.getFileBlockLocations(path, 0, path.getLen());

        // if passed in root map is not null, then validate that all blocks for the current file reside on the desired node 
        if (optionalRootMapHint != null) {
            // the host all blocks should reside on 
            String desiredHost = optionalRootMapHint.get(partitionNumber);

            ArrayList<String> misplacedBlocks = new ArrayList<String>();
            // ok walk all blocks 
            for (BlockLocation location : locations) {
                boolean found = false;
                for (String host : location.getHosts()) {
                    if (host.compareTo(desiredHost) == 0) {
                        found = true;
                        break;
                    }
                }
                if (!found) {
                    misplacedBlocks.add("Block At:" + location.getOffset() + " for File:" + path.getPath()
                            + " did not contain desired location:" + desiredHost);
                }

            }
            // ok pass test at a certain threshold 
            if (misplacedBlocks.size() != 0
                    && ((float) misplacedBlocks.size() / (float) locations.length) > .50f) {
                LOG.error("Misplaced Blocks Exceed Threshold");
                for (String misplacedBlock : misplacedBlocks) {
                    LOG.error(misplacedBlock);
                }
                // TODO: SKIP THIS STEP FOR NOW ??? 
                //throw new IOException("Misplaced Blocks Exceed Threshold!");
            }
            partitionToNodeMap.put(partitionNumber, desiredHost);
        } else {
            if (excludedNodeList != null) {
                // LOG.info("Exclued Node List is:" + Lists.newArrayList(excludedNodeList).toString());
            }
            // ok ask file system for block locations
            TreeMap<String, Integer> nodeToBlockCount = new TreeMap<String, Integer>();

            for (BlockLocation location : locations) {
                for (String host : location.getHosts()) {
                    if (excludedNodeList == null || !excludedNodeList.contains(host)) {
                        Integer nodeHitCount = nodeToBlockCount.get(host);
                        if (nodeHitCount == null) {
                            nodeToBlockCount.put(host, 1);
                        } else {
                            nodeToBlockCount.put(host, nodeHitCount.intValue() + 1);
                        }
                    }
                }
            }

            if (nodeToBlockCount.size() == 0) {
                throw new IOException("No valid nodes found for partition number:" + path);
            }

            Map.Entry<String, Integer> entries[] = nodeToBlockCount.entrySet().toArray(new Map.Entry[0]);
            Arrays.sort(entries, new Comparator<Map.Entry<String, Integer>>() {

                @Override
                public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
                    return o1.getValue().intValue() < o2.getValue().intValue() ? 1
                            : o1.getValue().intValue() == o2.getValue().intValue() ? 0 : -1;
                }
            });

            // build a list of nodes by priority ... 
            List<String> nodesByPriority = Lists.transform(Lists.newArrayList(entries),
                    new Function<Map.Entry<String, Integer>, String>() {

                        @Override
                        public String apply(Entry<String, Integer> entry) {
                            return entry.getKey();
                        }
                    });

            // stash it away ... 
            paritionToDesiredCandidateList.put(partitionNumber, nodesByPriority);
            //LOG.info("Mapping Partition:" + partitionNumber + " To Node:" + entries[0].getKey() + " BlockCount" + entries[0].getValue().intValue());
            partitionToNodeMap.put(partitionNumber, entries[0].getKey());
            // store the inverse mapping ... 
            inverseMap.put(entries[0].getKey(), partitionNumber);
        }
    }

    if (skipBalance) {
        // walk partition map to make sure everything is assigned ...
        /*
        for (String node : inverseMap.keys()) { 
          if (inverseMap.get(node).size() > maxReducersPerNode) { 
            throw new IOException("Node:" + node + " has too many partitions! ("+inverseMap.get(node).size());
          }
        }
        */
    }

    // now if optional root map hint is null 
    if (optionalRootMapHint == null && !skipBalance) {
        // figure out if there is an imbalance
        int avgRegionsPerNode = (int) Math.floor((float) paths.length / (float) inverseMap.keySet().size());
        int maxRegionsPerNode = (int) Math.ceil((float) paths.length / (float) inverseMap.keySet().size());
        LOG.info("Attempting to ideally balance nodes. Avg paritions per node:" + avgRegionsPerNode);

        // two passes .. 
        for (int pass = 0; pass < 2; ++pass) {
            LOG.info("Pass:" + pass);
            // iterate nodes ... 
            for (String node : ImmutableSet.copyOf(inverseMap.keySet())) {
                // get partitions in the map
                Collection<Integer> paritions = ImmutableList.copyOf(inverseMap.get(node));
                // if partition count exceeds the desired average ... 
                if (paritions.size() > maxRegionsPerNode) {
                    // first pass, assign based on preference 
                    if (pass == 0) {
                        LOG.info("Node:" + node + " parition count:" + paritions.size() + " exceeds avg:"
                                + avgRegionsPerNode);
                        // walk partitions trying to find a node to discard the partition to 
                        for (int partition : paritions) {
                            for (String candidate : paritionToDesiredCandidateList.get(partition)) {
                                if (!candidate.equals(node)) {
                                    // see if this candidate has room ..
                                    if (inverseMap.get(candidate).size() < avgRegionsPerNode) {
                                        LOG.info("REASSIGNING parition:" + partition + " from Node:" + node
                                                + " to Node:" + candidate);
                                        // found match reassign it ... 
                                        inverseMap.remove(node, partition);
                                        inverseMap.put(candidate, partition);
                                        break;
                                    }
                                }
                            }
                            // break out if we reach our desired number of partitions for this node 
                            if (inverseMap.get(node).size() == avgRegionsPerNode)
                                break;
                        }
                    }
                    // second pass ... assign based on least loaded node ... 
                    else {
                        int desiredRelocations = paritions.size() - maxRegionsPerNode;
                        LOG.info("Desired Relocation for node:" + node + ":" + desiredRelocations
                                + " partitions:" + paritions.size());
                        for (int i = 0; i < desiredRelocations; ++i) {
                            String leastLoadedNode = null;
                            int leastLoadedNodePartitionCount = 0;

                            for (String candidateNode : inverseMap.keySet()) {
                                if (leastLoadedNode == null || inverseMap.get(candidateNode)
                                        .size() < leastLoadedNodePartitionCount) {
                                    leastLoadedNode = candidateNode;
                                    leastLoadedNodePartitionCount = inverseMap.get(candidateNode).size();
                                }
                            }
                            int bestPartition = -1;
                            int bestParitionOffset = -1;

                            for (int candidateParition : inverseMap.get(node)) {
                                int offset = 0;
                                for (String nodeCandidate : paritionToDesiredCandidateList
                                        .get(candidateParition)) {
                                    if (nodeCandidate.equals(leastLoadedNode)) {
                                        if (bestPartition == -1 || bestParitionOffset > offset) {
                                            bestPartition = candidateParition;
                                            bestParitionOffset = offset;
                                        }
                                        break;
                                    }
                                    offset++;
                                }
                            }
                            if (bestPartition == -1) {
                                bestPartition = Iterables.get(inverseMap.get(node), 0);
                            }
                            LOG.info("REASSIGNING parition:" + bestPartition + " from Node:" + node
                                    + " to Node:" + leastLoadedNode);
                            // found match reassign it ... 
                            inverseMap.remove(node, bestPartition);
                            inverseMap.put(leastLoadedNode, bestPartition);
                        }
                    }
                }
            }
        }
        LOG.info("Rebuilding parition to node map based on ideal balance");
        for (String node : inverseMap.keySet()) {
            LOG.info("Node:" + node + " has:" + inverseMap.get(node).size() + " partitions:"
                    + inverseMap.get(node).toString());
        }

        partitionToNodeMap.clear();
        for (Map.Entry<String, Integer> entry : inverseMap.entries()) {
            partitionToNodeMap.put(entry.getValue(), entry.getKey());
        }
    }

    StringBuilder builder = new StringBuilder();
    int itemCount = 0;
    for (Map.Entry<Integer, String> entry : partitionToNodeMap.entrySet()) {
        if (itemCount++ != 0)
            builder.append("\t");
        builder.append(entry.getKey().intValue() + "," + entry.getValue());
    }

    return builder.toString();
}

From source file: org.exem.flamingo.shared.util.HdfsUtils.java

License: Apache License

/**
 * Moves the file(s) matching the source path to the target path.
 *
 * @param source source path (may contain a glob pattern)
 * @param target target path
 * @param fs     Hadoop FileSystem
 */
public static void move(String source, String target, FileSystem fs) throws Exception {
    Path srcPath = new Path(source);
    Path[] srcs = FileUtil.stat2Paths(fs.globStatus(srcPath), srcPath);
    Path dst = new Path(target);
    if (srcs.length > 1 && !fs.getFileStatus(dst).isDirectory()) {
        throw new ServiceException("When moving multiple files, destination should be a directory.");
    }
    for (int i = 0; i < srcs.length; i++) {
        if (!fs.rename(srcs[i], dst)) {
            FileStatus srcFstatus = null;
            FileStatus dstFstatus = null;
            try {
                srcFstatus = fs.getFileStatus(srcs[i]);
            } catch (FileNotFoundException e) {
                throw new FileNotFoundException(srcs[i] + ": No such file or directory");
            }
            try {
                dstFstatus = fs.getFileStatus(dst);
            } catch (IOException e) {
                // Nothing
            }
            if ((srcFstatus != null) && (dstFstatus != null)) {
                if (srcFstatus.isDirectory() && !dstFstatus.isDirectory()) {
                    throw new ServiceException(
                            "cannot overwrite non directory " + dst + " with directory " + srcs[i]);
                }
            }
            throw new ServiceException("Failed to rename " + srcs[i] + " to " + dst);
        }
    }
}
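
A hedged usage sketch of this helper follows; the caller class, both paths, and the default Configuration are illustrative assumptions. Inside move(), the part-* glob in the source argument is expanded by fs.globStatus().

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import org.exem.flamingo.shared.util.HdfsUtils;

public class HdfsMoveExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Both paths below are made up for illustration; the target directory is
        // assumed to already exist, since move() renames each matched file into it.
        HdfsUtils.move("/user/flamingo/output/part-*", "/user/flamingo/archive", fs);
    }
}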