Example usage for org.apache.hadoop.fs FileStatus getPath

List of usage examples for org.apache.hadoop.fs FileStatus getPath

Introduction

On this page you can find example usages for org.apache.hadoop.fs FileStatus getPath.

Prototype

public Path getPath() 
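
getPath() returns the fully qualified Path of the file or directory that a FileStatus describes. Before the collected examples, here is a minimal self-contained sketch of the typical pattern — listing a directory and reading back each entry's path. The class name and directory are placeholders, not part of any example below:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetPathExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // List a placeholder directory and print each entry's fully qualified Path.
        for (FileStatus status : fs.listStatus(new Path("/tmp/example"))) {
            System.out.println(status.getPath());
        }
    }
}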

Usage

From source file:com.ibm.stocator.fs.swift2d.systemtests.SwiftTestUtils.java

License:Open Source License

/**
 * Assert that a FileSystem.listStatus on a dir finds the subdir/child entry
 * @param fs filesystem
 * @param dir directory to scan
 * @param subdir full path to look for
 * @throws IOException IO problems
 */
public static void assertListStatusFinds(FileSystem fs, Path dir, Path subdir) throws IOException {
    FileStatus[] stats = fs.listStatus(dir);
    boolean found = false;
    StringBuilder builder = new StringBuilder();
    for (FileStatus stat : stats) {
        builder.append(stat.toString()).append('\n');
        if (stat.getPath().equals(subdir)) {
            found = true;
        }
    }
    assertTrue("Path " + subdir + " not found in directory " + dir + ":" + builder, found);
}
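
For reference, a hypothetical test calling this helper after creating a child entry; the fs instance and paths below are assumptions, not taken from the test suite:

// Hypothetical usage of assertListStatusFinds (paths are placeholders).
Path dir = new Path("/test/dir");
Path child = new Path(dir, "file");
fs.mkdirs(dir);
fs.createNewFile(child);
SwiftTestUtils.assertListStatusFinds(fs, dir, child);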

From source file:com.ibm.stocator.fs.swift2d.systemtests.TestSwiftFileSystemLsOperations.java

License:Apache License

@Ignore("Unexpected")
public void testListNonEmptyRoot() throws Throwable {
    Path test = path(getBaseURI() + "/test");
    touch(sFileSystem, test);
    FileStatus[] fileStatuses = sFileSystem.listStatus(path(getBaseURI() + "/"));
    String stats = dumpStats("/", fileStatuses);
    assertEquals("Wrong #of root children" + stats, 1, fileStatuses.length);
    FileStatus status = fileStatuses[0];
    assertEquals("Wrong path value" + stats, test, status.getPath());
}

From source file:com.ibm.stocator.fs.swift2d.systemtests.TestSwiftFileSystemLsOperations.java

License:Apache License

@Ignore("Not supported")
public void testListStatusFiltered() throws Throwable {
    Path dir = path(getBaseURI() + "/");
    Path child = path(getBaseURI() + "/test");
    touch(sFileSystem, child);
    FileStatus[] stats = sFileSystem.listStatus(dir, new AcceptAllFilter());
    boolean found = false;
    StringBuilder builder = new StringBuilder();
    for (FileStatus stat : stats) {
        builder.append(stat.toString()).append('\n');
        if (stat.getPath().equals(child)) {
            found = true;
        }
    }
    assertTrue("Path " + child + " not found in directory " + dir + ":" + builder, found);
}

From source file:com.iflytek.spider.crawl.GeneratorSmart.java

License:Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 * 
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param curTime
 *          Current time in milliseconds
 * 
 * @return Path to generated segment or null if no entries were selected
 * 
 * @throws IOException
 *           When an I/O error occurs
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long curTime, boolean force)
        throws IOException, InterruptedException, ClassNotFoundException {
    //getConf().set("mapred.temp.dir", "d:/tmp");
    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: starting");

    Job job = AvroJob.getAvroJob(getConf());
    if (numLists == -1) { // for politeness make
        numLists = job.getNumReduceTasks(); // a partition per fetch task
    }
    if ("local".equals(job.getConfiguration().get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    LOG.info("Generator: with " + numLists + " partition.");
    job.getConfiguration().setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorMapper.class);
    job.setReducerClass(SelectorReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    //job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputFormatClass(GeneratorOutputFormat.class);
    job.setOutputKeyClass(Float.class);
    job.setOutputValueClass(SelectorEntry.class);
    // AvroMultipleOutputs.addNamedOutput(job, "seq",
    // AvroPairOutputFormat.class, Float.class, SelectorEntry.class);
    try {
        job.waitForCompletion(true);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);

            fs.createNewFile(new Path(newSeg, "generatored"));
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = AvroJob.getAvroJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormatClass(AvroPairInputFormat.class);
        job.setMapperClass(CrawlDbUpdateMapper.class);
        // job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormatClass(AvroMapOutputFormat.class);
        job.setOutputKeyClass(String.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            job.waitForCompletion(true);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: done.");
    }
    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
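
For orientation, a hypothetical driver invocation of generate(). The crawl-directory layout, the setConf() call, and the -1 sentinel (meaning "use the job's default number of reduce tasks", per the code above) are assumptions:

// Hypothetical driver call (paths are placeholders; assumes the usual Configured pattern).
GeneratorSmart generator = new GeneratorSmart();
generator.setConf(new Configuration());
Path[] segs = generator.generate(new Path("crawl/crawldb"), new Path("crawl/segments"), -1,
        System.currentTimeMillis(), false);
if (segs == null) {
    System.out.println("Generator produced no segments.");
}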

From source file:com.iflytek.spider.parse.ParseSegment.java

License:Apache License

public int run(String[] args) throws Exception {

    String usage = "Usage: ParseSegment segments";

    if (args.length == 0) {
        System.err.println(usage);
        System.exit(-1);
    }
    FileSystem fs = FileSystem.get(getConf());
    for (FileStatus p : fs.listStatus(new Path(args[0]))) {
        if (fs.exists(new Path(p.getPath(), "crawl_parse")))
            fs.delete(new Path(p.getPath(), "crawl_parse"), true);
        if (fs.exists(new Path(p.getPath(), "parse_data")))
            fs.delete(new Path(p.getPath(), "parse_data"), true);
        parse(p.getPath());
    }
    return 0;
}
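
run(String[]) follows the standard Hadoop Tool contract, so a launch would typically go through ToolRunner; a minimal sketch, assuming ParseSegment extends Configured and implements Tool:

// Hypothetical launcher, assuming ParseSegment implements Tool.
public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new ParseSegment(), args);
    System.exit(res);
}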

From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedCombineFileInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setxxxSplitSize() takes precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong(SPLIT_MINSIZE_PERNODE, 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong(SPLIT_MINSIZE_PERRACK, 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 0);
        // If maxSize is not configured, a single split will be generated per
        // node.
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size pernode " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack " + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be larger than minimum split " + "size per rack " + minSizeRack);
    }

    // all the files in input set
    List<FileStatus> stats = listStatus(job);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (stats.size() == 0) {
        return splits;
    }

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<FileStatus> myPaths = new ArrayList<FileStatus>();

        // pick one input path. If it matches all the filters in a pool,
        // add it to the output set
        for (Iterator<FileStatus> iter = stats.iterator(); iter.hasNext();) {
            FileStatus p = iter.next();
            if (onepool.accept(p.getPath())) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }
        // create splits for all files in this pool.
        getMoreSplits(job, myPaths, maxSize, minSizeNode, minSizeRack, splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(job, stats, maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
}
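
The three sizing constraints validated above can be supplied either through the protected setXxxSplitSize() setters or through the configuration; a minimal job-setup sketch using the standard Hadoop property names referenced by the constants above (the byte values are arbitrary examples):

// Sketch: supplying combine-split sizing via configuration (values are examples).
Configuration conf = job.getConfiguration();
conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 256L * 1024 * 1024);
conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.node", 64L * 1024 * 1024);
conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 128L * 1024 * 1024);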

From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedCombineFileInputFormat.java

License:Apache License

/**
 * Return all the splits in the specified set of paths
 */
private void getMoreSplits(JobContext job, List<FileStatus> stats, long maxSize, long minSizeNode,
        long minSizeRack, List<InputSplit> splits) throws IOException {
    Configuration conf = job.getConfiguration();

    // all blocks for all the files in input set
    OneFileInfo[] files;

    // mapping from a rack name to the list of blocks it has
    HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();

    // mapping from a block to the nodes on which it has replicas
    HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();

    // mapping from a node to the list of blocks that it contains
    HashMap<String, Set<OneBlockInfo>> nodeToBlocks = new HashMap<String, Set<OneBlockInfo>>();

    files = new OneFileInfo[stats.size()];
    if (stats.size() == 0) {
        return;
    }

    // populate all the blocks for all files
    long totLength = 0;
    int i = 0;
    for (FileStatus stat : stats) {
        files[i] = new OneFileInfo(stat, conf, isSplitable(job, stat.getPath()), rackToBlocks, blockToNodes,
                nodeToBlocks, rackToNodes, maxSize);
        totLength += files[i].getLength();
        i++; // advance the index; without this every iteration would overwrite files[0]
    }
    createSplits(nodeToBlocks, blockToNodes, rackToBlocks, totLength, maxSize, minSizeNode, minSizeRack,
            splits);
}

From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java

License:Apache License

private List<FileStatus> singleThreadedListStatus(JobContext job, Path[] dirs, PathFilter inputFilter,
        boolean recursive) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();
    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath());
                    while (iter.hasNext()) {
                        LocatedFileStatus stat = iter.next();
                        if (inputFilter.accept(stat.getPath())) {
                            if (recursive && stat.isDirectory()) {
                                addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                            } else {
                                result.add(stat);
                            }
                        }
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    return result;
}
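
The recursive flag received here is conventionally driven by FileInputFormat's recursive-listing switch; a one-line sketch using the standard property name, assuming this class honors the same key:

// Sketch: enabling recursive input-directory listing for the job.
job.getConfiguration().setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);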

From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java

License:Apache License

/** 
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Stopwatch sw = Stopwatch.createStarted();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                FileSystem fs = path.getFileSystem(job.getConfiguration());
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
                        blkLocations[0].getCachedHosts()));
            }
        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: "
                + sw.elapsed(TimeUnit.MILLISECONDS));
    }
    return splits;
}
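
The splitSize used in the loop above comes from computeSplitSize(), which in stock FileInputFormat clamps the file's block size between the configured minimum and maximum:

// FileInputFormat's split-size rule: clamp blockSize into [minSize, maxSize].
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
}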

From source file:com.ikanow.aleph2.core.shared.utils.DirUtils.java

License:Apache License

/** Returns the path to the first subdirectory matching the subDirectoryName parameter, or null if none is found.
* @param fileContext the FileContext used to list directories
* @param start the directory where the recursive search begins
* @param subDirectoryName the name fragment to match against directory names
* @return the first matching subdirectory Path, or null if no match exists
*/
public static Path findOneSubdirectory(FileContext fileContext, Path start, String subDirectoryName) {
    Path p = null;
    try {
        logger.debug("findOneSubdirectory :" + start.toString());
        FileStatus[] statuss = fileContext.util().listStatus(start);
        for (int i = 0; i < statuss.length; i++) {
            FileStatus dir = statuss[i];
            logger.debug("FileStatus:" + statuss[i].getPath().toString());
            if (dir.isDirectory()) {
                if (dir.getPath().getName().contains(subDirectoryName)) {
                    logger.debug("findOneSubdirectory match:" + dir.getPath().getName());
                    return dir.getPath();
                } else {
                    p = findOneSubdirectory(fileContext, dir.getPath(), subDirectoryName);
                    if (p != null) {
                        return p;
                    }
                }
            }
        }

    } catch (Exception e) {
        logger.error("findOneSubdirectory Caught Exception", e);
    }

    return p;
}
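
A hypothetical call, using the default FileContext and a placeholder starting directory:

// Hypothetical usage of findOneSubdirectory (paths are placeholders).
FileContext fc = FileContext.getFileContext();
Path match = DirUtils.findOneSubdirectory(fc, new Path("/app/data"), "ready");
if (match != null) {
    System.out.println("Found subdirectory: " + match);
}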