List of usage examples for org.apache.hadoop.fs.Path#getName()
public String getName()
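Before the project examples below, here is a minimal sketch of what getName() returns: the final component of the path, with any parent directories stripped. The paths used here are illustrative only.

import org.apache.hadoop.fs.Path;

public class PathGetNameSketch {
    public static void main(String[] args) {
        // getName() returns only the last path component
        Path file = new Path("/user/hadoop/data/part-00000.gz"); // illustrative path
        System.out.println(file.getName()); // prints "part-00000.gz"

        // For a single-component (relative) path, getName() returns that component unchanged
        Path rel = new Path("README.md"); // illustrative path
        System.out.println(rel.getName()); // prints "README.md"
    }
}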
From source file:com.talis.hadoop.rdf.ZipUtils.java
License:Apache License
public static void packZipFile(Path perm, Path temp, Configuration conf, FileSystem fs) throws IOException {
    FSDataOutputStream out = null;
    ZipOutputStream zos = null;
    int zipCount = 0;
    LOG.info("Packing zip file for " + perm);
    try {
        out = fs.create(perm, false);
        zos = new ZipOutputStream(out);
        String name = perm.getName().replaceAll(".zip$", "");
        LOG.info("adding index directory" + temp);
        zipCount = zipDirectory(conf, zos, "", temp.toString(), temp);
    } catch (Throwable t) {
        LOG.error("packZipFile exception", t);
        if (t instanceof RuntimeException) {
            throw (RuntimeException) t;
        }
        if (t instanceof IOException) {
            throw (IOException) t;
        }
        throw new IOException(t);
    } finally {
        if (zos != null) {
            if (zipCount == 0) {
                // If no entries were written, only close out, as
                // the zip will throw an error
                LOG.error("No entries written to zip file " + perm);
                fs.delete(perm, false);
                // out.close();
            } else {
                LOG.info(String.format("Wrote %d items to %s for %s", zipCount, perm, temp));
                zos.close();
            }
        }
    }
}
From source file:com.talis.mapreduce.dicenc.ThirdDriver.java
License:Apache License
@Override
public boolean accept(Path path) {
    return !path.getName().startsWith("dict");
}
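For context, a hedged sketch of how a PathFilter like the one above is typically applied, for example with FileSystem#listStatus; the input path is illustrative only.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class DictFilterSketch {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // Same idea as the accept() method above: skip files whose name starts with "dict"
        PathFilter filter = path -> !path.getName().startsWith("dict");
        FileStatus[] statuses = fs.listStatus(new Path("/tmp/input"), filter); // illustrative path
        for (FileStatus status : statuses) {
            System.out.println(status.getPath().getName());
        }
    }
}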
From source file:com.tamingtext.util.SplitInput.java
License:Apache License
/**
 * Perform a split on the specified input file. Results will be written to files of the same name in the specified
 * training and test output directories. The {@link #validate()} method is called prior to executing the split.
 */
public void splitFile(Path inputFile) throws IOException {
    if (fs.getFileStatus(inputFile) == null) {
        throw new IOException(inputFile + " does not exist");
    } else if (fs.getFileStatus(inputFile).isDir()) {
        throw new IOException(inputFile + " is a directory");
    }

    validate();

    Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
    Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName());

    int lineCount = countLines(fs, inputFile, charset);

    log.info("{} has {} lines", inputFile.getName(), lineCount);

    int testSplitStart = 0;
    int testSplitSize = this.testSplitSize; // don't modify state
    BitSet randomSel = null;

    if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) {
        testSplitSize = this.testRandomSelectionSize;

        if (testRandomSelectionPct > 0) {
            testSplitSize = Math.round(lineCount * (testRandomSelectionPct / 100.0f));
        }
        log.info("{} test split size is {} based on random selection percentage {}",
                new Object[] { inputFile.getName(), testSplitSize, testRandomSelectionPct });
        long[] ridx = new long[testSplitSize];
        RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom());
        randomSel = new BitSet(lineCount);
        for (long idx : ridx) {
            randomSel.set((int) idx + 1);
        }
    } else {
        if (testSplitPct > 0) { // calculate split size based on percentage
            testSplitSize = Math.round(lineCount * (testSplitPct / 100.0f));
            log.info("{} test split size is {} based on percentage {}",
                    new Object[] { inputFile.getName(), testSplitSize, testSplitPct });
        } else {
            log.info("{} test split size is {}", inputFile.getName(), testSplitSize);
        }

        if (splitLocation > 0) { // calculate start of split based on percentage
            testSplitStart = Math.round(lineCount * (splitLocation / 100.0f));
            if (lineCount - testSplitStart < testSplitSize) {
                // adjust split start downwards based on split size.
                testSplitStart = lineCount - testSplitSize;
            }
            log.info("{} test split start is {} based on split location {}",
                    new Object[] { inputFile.getName(), testSplitStart, splitLocation });
        }

        if (testSplitStart < 0) {
            throw new IllegalArgumentException("test split size for " + inputFile + " is too large, it would produce an "
                    + "empty training set from the initial set of " + lineCount + " examples");
        } else if ((lineCount - testSplitSize) < testSplitSize) {
            log.warn("Test set size for {} may be too large, {} is larger than the number of "
                    + "lines remaining in the training set: {}",
                    new Object[] { inputFile, testSplitSize, lineCount - testSplitSize });
        }
    }

    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
    Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
    Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset);

    int pos = 0;
    int trainCount = 0;
    int testCount = 0;

    String line;
    while ((line = reader.readLine()) != null) {
        pos++;

        Writer writer;
        if (testRandomSelectionPct > 0) { // Randomly choose
            writer = randomSel.get(pos) ? testWriter : trainingWriter;
        } else { // Choose based on location
            writer = pos > testSplitStart ? testWriter : trainingWriter;
        }

        if (writer == testWriter) {
            if (testCount >= testSplitSize) {
                writer = trainingWriter;
            } else {
                testCount++;
            }
        }
        if (writer == trainingWriter) {
            trainCount++;
        }
        writer.write(line);
        writer.write('\n');
    }
    IOUtils.close(Collections.singleton(trainingWriter));
    IOUtils.close(Collections.singleton(testWriter));
    log.info("file: {}, input: {} train: {}, test: {} starting at {}",
            new Object[] { inputFile.getName(), lineCount, trainCount, testCount, testSplitStart });
    // testing;
    if (callback != null) {
        callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart);
    }
}
From source file:com.taobao.datax.plugins.common.DFSUtils.java
License:Open Source License
/**
 * Delete file specified by path.
 *
 * @param path
 *            {@link Path} in hadoop
 * @param flag
 *            need to do delete recursively
 * @throws IOException
 */
public static void deleteFile(Path path, boolean flag) throws IOException {
    log.debug("deleting:" + path.getName());
    fs.delete(path, flag);
}
From source file:com.taobao.datax.plugins.common.DFSUtils.java
License:Open Source License
/**
 * Delete file specified by path.
 *
 * @param dfs
 *            handle of {@link FileSystem}
 * @param path
 *            {@link Path} in hadoop
 * @param flag
 *            need to do delete recursively
 * @throws IOException
 */
public static void deleteFile(FileSystem dfs, Path path, boolean flag) throws IOException {
    log.debug("deleting:" + path.getName());
    dfs.delete(path, flag);
}
From source file:com.TCG.Nutch_DNS.Generator.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {

    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + UUID.randomUUID().toString());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
From source file:com.thinkbiganalytics.spark.io.ZipStreamingOutput.java
License:Apache License
/**
 * Creates a new ZIP entry for the specified path.
 */
@Nonnull
private ZipEntry createEntry(@Nonnull final Path path) {
    return new ZipEntry(path.getName());
}
From source file:com.toy.Client.java
License:Apache License
private static void registerLocalResource(Map<String, LocalResource> localResources, ApplicationId appId,
        FileSystem fs, Path src) throws IOException {
    String pathSuffix = Constants.TOY_PREFIX + appId.toString() + "/" + src.getName();
    Path dst = new Path(fs.getHomeDirectory(), pathSuffix);
    LOG.info("Copy {} from local filesystem to {} and add to local environment", src.getName(), dst.toUri());
    fs.copyFromLocalFile(false, true, src, dst);
    FileStatus destStatus = fs.getFileStatus(dst);
    LocalResource amJarRsrc = Records.newRecord(LocalResource.class);
    amJarRsrc.setType(LocalResourceType.FILE);
    amJarRsrc.setVisibility(LocalResourceVisibility.APPLICATION);
    amJarRsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst));
    amJarRsrc.setTimestamp(destStatus.getModificationTime());
    amJarRsrc.setSize(destStatus.getLen());
    localResources.put(src.getName(), amJarRsrc);
}
From source file:com.trendmicro.hdfs.webdav.HDFSResource.java
License:Apache License
@Override
public String getHref() {
    StringBuilder sb = new StringBuilder();
    Path p = this.path;
    while (p != null && !("".equals(p.getName()))) {
        sb.insert(0, p.getName());
        sb.insert(0, "/");
        p = p.getParent();
    }
    if (sb.length() == 0) {
        sb.insert(0, "/");
    }
    return sb.toString();
}
From source file:com.tripadvisor.hadoop.BackupHdfs.java
License:Apache License
/**
 * Method to go through the HDFS filesystem in a DFS to find all
 * files
 *
 * fs: FileSystem object from HDFS
 * minDate: Oldest date for files to be backed up
 * maxDate: Newest date for files to be backed up
 * p: Path in HDFS to look for files
 * pathList: Will be filled with all files in p
 * hmTimestamps: hashmap of timestamps for later sorting
 **/
public void checkDir(FileSystem fs, long minDate, long maxDate, Path p, ArrayList<Path> pathList,
        HashMap<Path, Long> hmTimestamps) {
    long tmpDate;
    FileStatus[] fStat;

    try {
        String sPath = p.toUri().getPath();

        // If this is a directory
        if (fs.getFileStatus(p).isDir()) {
            // ignore certain directories
            if ("dfstmp".equals(p.getName()) || "tmp".equals(p.getName()) || "jobtracker".equals(p.getName())
                    || sPath.startsWith("/mapred") || "ops".equals(p.getName())
                    || p.getName().startsWith("_distcp_logs")) {
                return;
            }

            // dump the mkdir and chmod commands for this
            // directory -- skip root directory only
            {
                FileStatus stat = fs.getFileStatus(p);

                if (!sPath.equals("/")) {
                    m_wrMkdirs.println("hadoop fs -mkdir " + sPath);
                }

                m_wrChmods.println("hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath);

                Short sh = new Short(stat.getPermission().toShort());
                m_wrChmods.println("hadoop fs -chmod " + Long.toOctalString(sh.longValue()) + " " + sPath);
            }

            fStat = fs.listStatus(p);

            // Do a recursive call to all elements
            for (int i = 0; i < fStat.length; i++) {
                checkDir(fs, minDate, maxDate, fStat[i].getPath(), pathList, hmTimestamps);
            }
        } else {
            // If not a directory then we've found a file

            // ignore crc files
            if (p.getName().endsWith(".crc")) {
                return;
            }

            // ignore other files
            if (sPath.startsWith("/user/oozie/etl/workflows/")) {
                return;
            }

            // try to get the table name from the path. There are
            // various types of tables, from those replicated from
            // another database to regular hive tables to
            // partitioned hive tables. We use table names to
            // both exclude some from the backup, and for the rest
            // to dump out the schema and partition name.
            if (m_ignoreTables != null && m_ignoreTables.doIgnoreFile(sPath)) {
                m_nIgnoredTables++;

                if (m_nIgnoredTables < 5) {
                    System.out.println("Skipping ignore-table file: " + sPath);
                } else if (m_nIgnoredTables == 5) {
                    System.out.println("(...not showing other skipped tables...)");
                }
                return;
            }

            FileStatus stat = fs.getFileStatus(p);

            tmpDate = stat.getModificationTime() / 1000;

            // store the chmods/chowns for all files
            m_wrChmods.println("hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath);
            m_wrChmods.println("hadoop fs -chmod " + stat.getPermission().toShort() + " " + sPath);

            // check dates. is it too young?
            if (tmpDate < minDate) {
                return;
            }

            // is the file too recent?
            if (tmpDate > maxDate) {
                // System.out.println("file too recent: " + sPath);
                return;
            }

            // file timestamp is ok
            pathList.add(p);
            hmTimestamps.put(p, new Long(tmpDate));

            // store info about total bytes needed to backup
            m_nTotalBytes += fs.getContentSummary(p).getLength();
        }
    } catch (IOException e) {
        System.err.println("ERROR: could not open " + p + ": " + e);
        // System.exit(1);
    }
}