List of usage examples for org.apache.hadoop.fs.Path.toString()
@Override
public String toString()
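Before the project examples, a minimal sketch of what Path.toString() returns: the path rendered as a string, including scheme and authority when the Path carries them, which is why the examples below pass it to string-based APIs such as command-line argument lists and SQL statements. The HDFS URI and file names here are purely illustrative.

import org.apache.hadoop.fs.Path;

public class PathToStringExample {
    public static void main(String[] args) {
        // A fully qualified path keeps its scheme and authority in toString().
        Path withScheme = new Path("hdfs://namenode:8020/user/data/input");
        // The (parent, child) constructor joins the two parts with "/".
        Path relative = new Path("reports", "part-m-00000");

        System.out.println(withScheme.toString()); // hdfs://namenode:8020/user/data/input
        System.out.println(relative.toString());   // reports/part-m-00000
    }
}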
From source file:com.clustertest2.clustertest2.vectorization.MRDocVectorizer.java
/**
 * Generate tf-idf weighted sparse vectors for documents stored in SequenceFile format.
 * Run in parallel using MapReduce.
 * Tokenizes the documents and calculates their tf-idf weights. Also creates a dictionary file
 * for the document corpus.
 * Overwrites any previous results in outputDir.
 */
public void generateVectorsFromSeqFiles(Path inputDir, Path outputDir, int minSupport, Analyzer analyzer,
        int chunkSize, int minDF, long maxDFPercent, int minLLR, int numReducers, int norm, int maxNGramSize,
        boolean sequentialAccessOutput, boolean namedVectorOutput) throws Exception {
    try {
        numThreads.set(0);
        String[] args = new String[] { "-Dmapred.child.java.opts=-Xmx1024", "-Dmapred.child.ulimit=2097152",
                "-Dio.sort.mb=256", "--input", inputDir.toString(), "--output", outputDir.toString(),
                "--minSupport", Integer.toString(minSupport), "--analyzerName", analyzer.getClass().getName(),
                "--chunkSize", Integer.toString(chunkSize), "--minDF", Integer.toString(minDF),
                "--maxDFPercent", Long.toString(maxDFPercent), "--minLLR", Integer.toString(minLLR),
                "--numReducers", Integer.toString(numReducers), "--norm", Integer.toString(norm),
                "--maxNGramSize", Integer.toString(maxNGramSize),
                sequentialAccessOutput ? "--sequentialAccessVector" : "",
                namedVectorOutput ? "--namedVector" : "", "--overwrite" };
        vectorizer.addWork(args);
        numThreads.incrementAndGet();
    } catch (Exception e) {
        System.out.println(e.getClass());
    } finally {
        while (numThreads.get() != 0) {
            Thread.sleep(1000);
        }
    }
}
From source file:com.collective.celos.ci.testing.fixtures.deploy.HdfsInputDeployer.java
License:Apache License
@Override
public void deploy(TestRun testRun) throws Exception {
    FileSystem fileSystem = testRun.getCiContext().getFileSystem();
    CollectFilesAndPathsProcessor pathToFile = new CollectFilesAndPathsProcessor();
    TreeObjectProcessor.process(fixObjectCreator.create(testRun), pathToFile);

    Path pathPrefixed = new Path(Util.augmentHdfsPath(testRun.getHdfsPrefix(), path.toString()));
    for (java.nio.file.Path childPath : pathToFile.pathToFiles.keySet()) {
        Path pathTo = new Path(pathPrefixed, childPath.toString());
        fileSystem.mkdirs(pathTo.getParent());

        FSDataOutputStream outputStream = fileSystem.create(pathTo);
        try {
            IOUtils.copy(pathToFile.pathToFiles.get(childPath).getContent(), outputStream);
        } finally {
            outputStream.flush();
            outputStream.close();
        }
    }
}
From source file:com.collective.celos.ci.testing.fixtures.deploy.hive.HiveTableDeployer.java
License:Apache License
private void loadDataToMockedTable(Statement statement, String mockedDatabase, Path dataFile, String tableName)
        throws SQLException, IOException {
    List<String> columnDef = Lists.newArrayList();
    List<String> partDef = Lists.newArrayList();
    parseTableDefinition(columnDef, partDef, mockedDatabase, statement);

    String createMockedTbl = String.format(CREATE_TEMP_TABLE_PATTERN, mockedDatabase, tableName,
            StringUtils.join(columnDef, ",\n"));
    statement.executeUpdate(createMockedTbl);

    String loadDataTmp = String.format(LOAD_TMP_DATA_PATTERN, dataFile.toString(), mockedDatabase, tableName);
    statement.executeUpdate(loadDataTmp);

    loadFromTempToRealDb(statement, tableName, partDef);
}
From source file:com.collective.celos.trigger.HDFSCheckTrigger.java
License:Apache License
private String humanReadableDescription(boolean ready, Path path) {
    if (ready) {
        return "HDFS path " + path.toString() + " is ready";
    } else {
        return "HDFS path " + path.toString() + " is not ready";
    }
}
From source file:com.conductor.hadoop.WritableValueInputFormat.java
License:Apache License
@VisibleForTesting
static <V extends Writable> void doSetupInput(final List<V> values, final Class<V> clazz,
        final int inputsPerSplit, final Job job, final Path inputPath, final Writer writer) throws IOException {
    job.getConfiguration().setClass(VALUE_TYPE_CONF, clazz, Writable.class);
    job.getConfiguration().setInt(INPUTS_PER_SPLIT_CONF, inputsPerSplit);
    job.getConfiguration().set(INPUT_FILE_LOCATION_CONF, inputPath.toString());

    // write each value to the sequence file
    int syncCounter = 0;
    for (final V input : values) {
        // each entry in the sequence file is a map input
        writer.append(NullWritable.get(), input);
        // syncing indicates an input split boundary
        if (++syncCounter % inputsPerSplit == 0) {
            writer.sync();
        }
    }

    // close the input file
    writer.hflush();
    writer.close();

    // delete file when JVM exits
    inputPath.getFileSystem(job.getConfiguration()).deleteOnExit(inputPath);
}
From source file:com.conductor.s3.S3InputFormatUtils.java
License:Apache License
/**
 * Efficiently gets the Hadoop {@link org.apache.hadoop.fs.FileStatus} for all S3 files under the provided
 * {@code dirs}.
 *
 * @param s3Client
 *            s3 client
 * @param blockSize
 *            the block size
 * @param dirs
 *            the dirs to search through
 * @return the {@link org.apache.hadoop.fs.FileStatus} version of all S3 files under {@code dirs}
 */
static List<FileStatus> getFileStatuses(final AmazonS3 s3Client, final long blockSize, final Path... dirs) {
    final List<FileStatus> result = Lists.newArrayList();
    for (final Path dir : dirs) {
        // get bucket and prefix from path
        final String bucket = S3HadoopUtils.getBucketFromPath(dir.toString());
        final String prefix = S3HadoopUtils.getKeyFromPath(dir.toString());

        // list request
        final ListObjectsRequest req = new ListObjectsRequest().withMaxKeys(Integer.MAX_VALUE)
                .withBucketName(bucket).withPrefix(prefix);

        // recursively page through all objects under the path
        for (ObjectListing listing = s3Client.listObjects(req); listing.getObjectSummaries().size() > 0;
                listing = s3Client.listNextBatchOfObjects(listing)) {
            for (final S3ObjectSummary summary : listing.getObjectSummaries()) {
                final Path path = new Path(
                        String.format("s3n://%s/%s", summary.getBucketName(), summary.getKey()));
                if (S3_PATH_FILTER.accept(path)) {
                    result.add(new FileStatus(summary.getSize(), false, 1, blockSize,
                            summary.getLastModified().getTime(), path));
                }
            }
            // don't need to check the next listing if this one is not truncated
            if (!listing.isTruncated()) {
                break;
            }
        }
    }
    return result;
}
From source file:com.continuent.tungsten.common.file.HdfsFileIO.java
License:Open Source License
/**
 * {@inheritDoc}
 *
 * @see com.continuent.tungsten.common.file.FileIO#write(com.continuent.tungsten.common.file.FilePath,
 *      java.lang.String, java.lang.String, boolean)
 */
@Override
public void write(FilePath path, String value, String charset, boolean fsync) throws FileIOException {
    // Write the data and flush to storage. This overwrites any previous version.
    Path p = new Path(path.toString());
    FSDataOutputStream os = null;
    try {
        os = (FSDataOutputStream) this.getOutputStream(path);
        BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(os, charset));
        bw.write(value);
        bw.flush();
        if (fsync) {
            os.hsync();
        }
    } catch (IOException e) {
        throw new FileIOException("Unable to write data to file: uri=" + uri + " path=" + p.toString()
                + " value=" + safeSynopsis(value, 20), e);
    } finally {
        if (os != null) {
            try {
                os.close();
            } catch (IOException e) {
            }
        }
    }
}
From source file:com.continuent.tungsten.common.file.HdfsFileIO.java
License:Open Source License
/**
 * {@inheritDoc}
 *
 * @see com.continuent.tungsten.common.file.FileIO#getOutputStream(com.continuent.tungsten.common.file.FilePath)
 */
public OutputStream getOutputStream(FilePath path) throws FileIOException {
    Path p = new Path(path.toString());
    try {
        return hdfs.create(p, true);
    } catch (IOException e) {
        throw new FileIOException("Unable to write data to file: uri=" + uri + " path=" + p.toString(), e);
    }
}
From source file:com.couchbase.sqoop.manager.CouchbaseManagerTest.java
License:Apache License
private void runCouchbaseTest(HashMap<String, String> expectedMap) throws IOException {
    Path warehousePath = new Path(this.getWarehouseDir());
    Path tablePath = new Path(warehousePath, TABLE_NAME);
    Path filePath = new Path(tablePath, "part-m-00000");

    File tableFile = new File(tablePath.toString());
    if (tableFile.exists() && tableFile.isDirectory()) {
        // remove the directory before running the import.
        FileListing.recursiveDeleteDir(tableFile);
    }

    String[] argv = getArgv();
    try {
        runImport(argv);
    } catch (IOException ioe) {
        LOG.error("Got IOException during import: " + ioe.toString());
        ioe.printStackTrace();
        fail(ioe.toString());
    }

    File f = new File(filePath.toString());
    assertTrue("Could not find imported data file", f.exists());

    BufferedReader r = null;
    try {
        // Read through the file and make sure it's all there.
        r = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
        String line;
        int records = 0;
        while ((line = r.readLine()) != null) {
            compareRecords(expectedMap, line);
            records++;
        }
        if (records < NUM_RECORDS) {
            fail("Not everything was imported. Got " + records + "/" + NUM_RECORDS + " records.");
        }
    } catch (IOException ioe) {
        LOG.error("Got IOException verifying results: " + ioe.toString());
        ioe.printStackTrace();
        fail(ioe.toString());
    } finally {
        IOUtils.closeStream(r);
    }
}
From source file:com.datasalt.pangool.examples.avro.AvroTweetsJoin.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        failArguments("Wrong number of arguments");
        return -1;
    }
    Path tweetsPath = new Path(args[0]);
    Path retweetsPath = new Path(args[1]);
    Path outputPath = new Path(args[2]);
    delete(outputPath.toString());

    TupleMRBuilder mr = new TupleMRBuilder(conf, "AvroTweetsJoin");
    mr.addIntermediateSchema(getPangoolTweetSchema());
    mr.addIntermediateSchema(getPangoolRetweetSchema());
    mr.setGroupByFields("tweet_id");
    mr.setOrderBy(new OrderBy().add("tweet_id", Order.ASC).addSchemaOrder(Order.ASC));
    mr.setSpecificOrderBy("retweet", new OrderBy().add("username", Order.ASC));

    mr.addInput(tweetsPath, new AvroInputFormat<Record>(getAvroTweetSchema()), new TweetsMapper());
    mr.addInput(retweetsPath, new HadoopInputFormat(TextInputFormat.class), new RetweetsMapper());
    mr.setOutput(outputPath, new AvroOutputFormat<Record>(getAvroOutputSchema()), AvroWrapper.class,
            NullWritable.class);
    mr.setTupleReducer(new Red());

    try {
        Job job = mr.createJob();
        job.waitForCompletion(true);
    } finally {
        mr.cleanUpInstanceFiles();
    }
    return 0;
}