Example usage for org.apache.hadoop.fs Path toString

Introduction

In this page you can find the example usage for org.apache.hadoop.fs Path toString.

Prototype

@Override
    public String toString()

Source Link

Usage

From source file:com.alexholmes.hadooputils.combine.avro.mapreduce.CombineAvroKeyValueInputFormatTest.java

License:Apache License

@Test
public void testKeyValueInput() throws ClassNotFoundException, IOException, InterruptedException {
    // Create a test input file.
    File inputFile = createInputFile();

    // Configure the job input.
    Job job = new Job();
    FileInputFormat.setInputPaths(job, new Path(inputFile.getAbsolutePath()));
    job.setInputFormatClass(CombineAvroKeyValueInputFormat.class);
    AvroJob.setInputKeySchema(job, Schema.create(Schema.Type.INT));
    AvroJob.setInputValueSchema(job, Schema.create(Schema.Type.STRING));

    // Configure a mapper.
    job.setMapperClass(IndexMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // Configure a reducer.
    job.setReducerClass(IndexReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(AvroValue.class);
    AvroJob.setOutputValueSchema(job, Schema.createArray(Schema.create(Schema.Type.INT)));

    // Configure the output format.
    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    Path outputPath = new Path(mTempDir.getRoot().getPath(), "out-index");
    FileOutputFormat.setOutputPath(job, outputPath);

    // Run the job.
    assertTrue(job.waitForCompletion(true));

    // Verify that the output Avro container file as the expected data.
    File avroFile = new File(outputPath.toString(), "part-r-00000.avro");
    DatumReader<GenericRecord> datumReader = new SpecificDatumReader<GenericRecord>(AvroKeyValue
            .getSchema(Schema.create(Schema.Type.STRING), Schema.createArray(Schema.create(Schema.Type.INT))));
    DataFileReader<GenericRecord> avroFileReader = new DataFileReader<GenericRecord>(avroFile, datumReader);
    assertTrue(avroFileReader.hasNext());

    AvroKeyValue<CharSequence, List<Integer>> appleRecord = new AvroKeyValue<CharSequence, List<Integer>>(
            avroFileReader.next());/*  w ww  .j  a v a 2  s. co m*/
    assertNotNull(appleRecord.get());
    assertEquals("apple", appleRecord.getKey().toString());
    List<Integer> appleDocs = appleRecord.getValue();
    assertEquals(3, appleDocs.size());
    assertTrue(appleDocs.contains(1));
    assertTrue(appleDocs.contains(2));
    assertTrue(appleDocs.contains(3));

    assertTrue(avroFileReader.hasNext());
    AvroKeyValue<CharSequence, List<Integer>> bananaRecord = new AvroKeyValue<CharSequence, List<Integer>>(
            avroFileReader.next());
    assertNotNull(bananaRecord.get());
    assertEquals("banana", bananaRecord.getKey().toString());
    List<Integer> bananaDocs = bananaRecord.getValue();
    assertEquals(2, bananaDocs.size());
    assertTrue(bananaDocs.contains(1));
    assertTrue(bananaDocs.contains(2));

    assertTrue(avroFileReader.hasNext());
    AvroKeyValue<CharSequence, List<Integer>> carrotRecord = new AvroKeyValue<CharSequence, List<Integer>>(
            avroFileReader.next());
    assertEquals("carrot", carrotRecord.getKey().toString());
    List<Integer> carrotDocs = carrotRecord.getValue();
    assertEquals(1, carrotDocs.size());
    assertTrue(carrotDocs.contains(1));

    assertFalse(avroFileReader.hasNext());
    avroFileReader.close();
}

From source file:com.alexholmes.hadooputils.sort.Sort.java

License:Apache License

/**
 * The driver for the sort MapReduce job.
 *
 * @param jobConf           sort configuration
 * @param numMapTasks       number of map tasks
 * @param numReduceTasks    number of reduce tasks
 * @param sampler           sampler, if required
 * @param codecClass        the compression codec for compressing final outputs
 * @param mapCodecClass     the compression codec for compressing intermediary map outputs
 * @param createLzopIndexes whether or not a MR job should be launched to create LZOP indexes
 *                          for the job output files
 * @param inputDirAsString  input directory in CSV-form
 * @param outputDirAsString output directory
 * @return true if the job completed successfully
 * @throws IOException        if something went wrong
 * @throws URISyntaxException if a URI wasn't correctly formed
 *//*w  w  w  . j  av a 2  s. c o m*/
public boolean runJob(final JobConf jobConf, final Integer numMapTasks, final Integer numReduceTasks,
        final InputSampler.Sampler<K, V> sampler, final Class<? extends CompressionCodec> codecClass,
        final Class<? extends CompressionCodec> mapCodecClass, final boolean createLzopIndexes,
        final String inputDirAsString, final String outputDirAsString) throws IOException, URISyntaxException {

    jobConf.setJarByClass(Sort.class);
    jobConf.setJobName("sorter");

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    if (numMapTasks != null) {
        jobConf.setNumMapTasks(numMapTasks);
    }
    if (numReduceTasks != null) {
        jobConf.setNumReduceTasks(numReduceTasks);
    } else {
        int numReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
        String sortReduces = jobConf.get("test.sort.reduces_per_host");
        if (sortReduces != null) {
            numReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
        }

        // Set user-supplied (possibly default) job configs
        jobConf.setNumReduceTasks(numReduces);
    }

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(SortReduce.class);

    jobConf.setInputFormat(SortInputFormat.class);

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    if (mapCodecClass != null) {
        jobConf.setMapOutputCompressorClass(mapCodecClass);
    }

    if (codecClass != null) {
        jobConf.setBoolean("mapred.output.compress", true);
        jobConf.setClass("mapred.output.compression.codec", codecClass, CompressionCodec.class);
    }

    FileInputFormat.setInputPaths(jobConf, inputDirAsString);
    FileOutputFormat.setOutputPath(jobConf, new Path(outputDirAsString));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];

        FileSystem fileSystem = FileSystem.get(jobConf);

        if (fileSystem.exists(inputDir) && fileSystem.isFile(inputDir)) {
            inputDir = inputDir.getParent();
        }
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + jobConf.getNumReduceTasks() + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

    if (jobResult.isSuccessful()) {
        if (createLzopIndexes && codecClass != null && LzopCodec.class.equals(codecClass)) {
            new LzoIndexer(jobConf).index(new Path(outputDirAsString));
        }
        return true;
    }
    return false;
}

From source file:com.alexholmes.hdfsslurper.FileSystemManager.java

License:Apache License

public Path getStagingFile(FileStatus srcFileStatus, Path destFile) {
    int hash = Math.abs(
            (srcFileStatus.getPath().toString() + destFile.toString()).hashCode() + new Random().nextInt());
    return new Path(config.getDestStagingDir(), String.valueOf(hash));
}

From source file:com.alexholmes.hdfsslurper.WorkerThread.java

License:Apache License

private void process(FileStatus srcFileStatus) throws IOException, InterruptedException {

    Path stagingFile = null;/*from w  w w. ja va2  s.  com*/
    FileSystem destFs = null;
    String filenameBatchidDelimiter = config.getFileNameBatchIdDelimiter();

    try {
        FileSystem srcFs = srcFileStatus.getPath().getFileSystem(config.getConfig());

        // run a script which can change the name of the file as well as
        // write out a new version of the file
        //
        if (config.getWorkScript() != null) {
            Path newSrcFile = stageSource(srcFileStatus);
            srcFileStatus = srcFileStatus.getPath().getFileSystem(config.getConfig()).getFileStatus(newSrcFile);
        }

        Path srcFile = srcFileStatus.getPath();

        // get the target HDFS file
        //
        Path destFile = getHdfsTargetPath(srcFileStatus);

        if (config.getCodec() != null) {
            String ext = config.getCodec().getDefaultExtension();
            if (!destFile.getName().endsWith(ext)) {
                destFile = new Path(destFile.toString() + ext);
            }
        }

        destFs = destFile.getFileSystem(config.getConfig());

        // get the staging HDFS file
        //
        stagingFile = fileSystemManager.getStagingFile(srcFileStatus, destFile);
        String batchId = srcFile.toString().substring(
                srcFile.toString().lastIndexOf(filenameBatchidDelimiter) + 1, srcFile.toString().length());

        log.info("event#Copying source file '" + srcFile + "' to staging destination '" + stagingFile + "'"
                + "$batchId#" + batchId);

        // if the directory of the target file doesn't exist, attempt to
        // create it
        //
        Path destParentDir = destFile.getParent();
        if (!destFs.exists(destParentDir)) {
            log.info("event#Attempting creation of target directory: " + destParentDir.toUri());
            if (!destFs.mkdirs(destParentDir)) {
                throw new IOException("event#Failed to create target directory: " + destParentDir.toUri());
            }
        }

        // if the staging directory doesn't exist, attempt to create it
        //
        Path destStagingParentDir = stagingFile.getParent();
        if (!destFs.exists(destStagingParentDir)) {
            log.info("event#Attempting creation of staging directory: " + destStagingParentDir.toUri());
            if (!destFs.mkdirs(destStagingParentDir)) {
                throw new IOException("event#Failed to create staging directory: " + destParentDir.toUri());
            }
        }

        // copy the file
        //
        InputStream is = null;
        OutputStream os = null;
        CRC32 crc = new CRC32();
        try {
            is = new BufferedInputStream(srcFs.open(srcFile));
            if (config.isVerify()) {
                is = new CheckedInputStream(is, crc);
            }
            os = destFs.create(stagingFile);

            if (config.getCodec() != null) {
                os = config.getCodec().createOutputStream(os);
            }

            IOUtils.copyBytes(is, os, 4096, false);
        } finally {
            IOUtils.closeStream(is);
            IOUtils.closeStream(os);
        }

        long srcFileSize = srcFs.getFileStatus(srcFile).getLen();
        long destFileSize = destFs.getFileStatus(stagingFile).getLen();
        if (config.getCodec() == null && srcFileSize != destFileSize) {
            throw new IOException(
                    "event#File sizes don't match, source = " + srcFileSize + ", dest = " + destFileSize);
        }

        log.info("event#Local file size = " + srcFileSize + ", HDFS file size = " + destFileSize + "$batchId#"
                + batchId);

        if (config.isVerify()) {
            verify(stagingFile, crc.getValue());
        }

        if (destFs.exists(destFile)) {
            destFs.delete(destFile, false);
        }

        log.info("event#Moving staging file '" + stagingFile + "' to destination '" + destFile + "'"
                + "$batchId#" + batchId);
        if (!destFs.rename(stagingFile, destFile)) {
            throw new IOException("event#Failed to rename file");
        }

        if (config.isCreateLzopIndex() && destFile.getName().endsWith(lzopExt)) {
            Path lzoIndexPath = new Path(destFile.toString() + LzoIndex.LZO_INDEX_SUFFIX);
            if (destFs.exists(lzoIndexPath)) {
                log.info("event#Deleting index file as it already exists");
                destFs.delete(lzoIndexPath, false);
            }
            indexer.index(destFile);
        }

        fileSystemManager.fileCopyComplete(srcFileStatus);

    } catch (Throwable t) {
        log.error("event#Caught exception working on file " + srcFileStatus.getPath(), t);

        // delete the staging file if it still exists
        //
        try {
            if (destFs != null && destFs.exists(stagingFile)) {
                destFs.delete(stagingFile, false);
            }
        } catch (Throwable t2) {
            log.error("event#Failed to delete staging file " + stagingFile, t2);
        }

        fileSystemManager.fileCopyError(srcFileStatus);
    }

}

From source file:com.alexholmes.hdfsslurper.WorkerThread.java

License:Apache License

private Path stageSource(FileStatus srcFile) throws IOException {
    String filenameBatchidDelimiter = config.getFileNameBatchIdDelimiter();
    Path p = new Path(ScriptExecutor.getStdOutFromScript(config.getWorkScript(), srcFile.getPath().toString(),
            60, TimeUnit.SECONDS, config.getFileNameBatchIdDelimiter()));
    String batchId = p.toString().substring(p.toString().lastIndexOf(filenameBatchidDelimiter) + 1,
            p.toString().length());/*from   w w  w. j  av  a2  s  .c  o m*/
    if (p.toUri().getScheme() == null) {
        throw new IOException(
                "event#Work path from script must be a URI with a scheme: '" + p + "'" + "$batchId#" + batchId);
    }
    log.info("event#Staging script returned new file '" + p + " for old " + srcFile.getPath() + "$batchId#"
            + batchId);
    return p;
}

From source file:com.alexholmes.hdfsslurper.WorkerThread.java

License:Apache License

private Path getDestPathFromScript(FileStatus srcFile) throws IOException {
    Path p = new Path(ScriptExecutor.getStdOutFromScript(config.getScript(), srcFile.getPath().toString(), 60,
            TimeUnit.SECONDS, config.getFileNameBatchIdDelimiter()));
    String filenameBatchidDelimiter = config.getFileNameBatchIdDelimiter();
    String batchId = p.toString().substring(p.toString().lastIndexOf(filenameBatchidDelimiter) + 1,
            p.toString().length());/* ww w .  j a  va  2 s  .  com*/
    if (p.toUri().getScheme() == null) {
        throw new IOException("event#Destination path from script must be a URI with a scheme: '" + p + "'"
                + "$batchId#" + batchId);
    }
    return p;
}

From source file:com.alibaba.jstorm.hdfs.spout.DirLock.java

License:Apache License

private DirLock(FileSystem fs, Path lockFile) throws IOException {
    if (fs.isDirectory(lockFile)) {
        throw new IllegalArgumentException(lockFile.toString() + " is not a directory");
    }//from   w  w w  .  j a v  a 2  s . c o  m
    this.fs = fs;
    this.lockFile = lockFile;
}

From source file:com.alibaba.jstorm.hdfs.spout.DirLock.java

License:Apache License

private static Path getDirLockFile(Path dir) {
    return new Path(dir.toString() + Path.SEPARATOR_CHAR + DIR_LOCK_FILE);
}

From source file:com.alibaba.jstorm.hdfs.spout.HdfsSpout.java

License:Apache License

private void markFileAsBad(Path file) {
    String fileName = file.toString();
    String fileNameMinusSuffix = fileName.substring(0, fileName.indexOf(inprogress_suffix));
    String originalName = new Path(fileNameMinusSuffix).getName();
    Path newFile = new Path(badFilesDirPath + Path.SEPARATOR + originalName);

    LOG.info("Moving bad file {} to {}. Processed it till offset {}. SpoutID= {}", originalName, newFile,
            tracker.getCommitPosition(), spoutId);
    try {/*w  w w .j  av a  2s .c om*/
        if (!hdfs.rename(file, newFile)) { // seems this can fail by returning false or throwing exception
            throw new IOException("Move failed for bad file: " + file); // convert false ret value to exception
        }
    } catch (IOException e) {
        LOG.warn("Error moving bad file: " + file + " to destination " + newFile + " SpoutId =" + spoutId, e);
    }
    closeReaderAndResetTrackers();
}

From source file:com.alibaba.jstorm.hdfs.spout.HdfsSpout.java

License:Apache License

private String getDefaultLockDir(Path sourceDirPath) {
    return sourceDirPath.toString() + Path.SEPARATOR + Configs.DEFAULT_LOCK_DIR;
}