List of usage examples for org.apache.hadoop.fs.Path.toString()
@Override
public String toString()
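Before the project examples, a minimal sketch of what Path.toString() returns: the path rendered as a string, including scheme and authority when the Path carries them, which is why the examples below pass it to string-based APIs such as command-line argument lists and SQL statements. The HDFS URI and file names here are purely illustrative.

import org.apache.hadoop.fs.Path;

public class PathToStringExample {
    public static void main(String[] args) {
        // A fully qualified path keeps its scheme and authority in toString().
        Path withScheme = new Path("hdfs://namenode:8020/user/data/input");
        // The (parent, child) constructor joins the two parts with "/".
        Path relative = new Path("reports", "part-m-00000");

        System.out.println(withScheme.toString()); // hdfs://namenode:8020/user/data/input
        System.out.println(relative.toString());   // reports/part-m-00000
    }
}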
From source file:com.clustertest2.clustertest2.vectorization.MRDocVectorizer.java
/**
 * Generate tf-idf weighted sparse vectors for documents stored in SequenceFile format.
 * Run in parallel using MapReduce.
 * Tokenizes the documents and calculates their tf-idf weights. Also creates a dictionary file
 * for the document corpus.
 * Overwrites any previous results in outputDir.
 */
public void generateVectorsFromSeqFiles(Path inputDir, Path outputDir, int minSupport, Analyzer analyzer,
        int chunkSize, int minDF, long maxDFPercent, int minLLR, int numReducers, int norm, int maxNGramSize,
        boolean sequentialAccessOutput, boolean namedVectorOutput) throws Exception {
    try {
        numThreads.set(0);
        String[] args = new String[] { "-Dmapred.child.java.opts=-Xmx1024", "-Dmapred.child.ulimit=2097152",
                "-Dio.sort.mb=256", "--input", inputDir.toString(), "--output", outputDir.toString(),
                "--minSupport", Integer.toString(minSupport), "--analyzerName", analyzer.getClass().getName(),
                "--chunkSize", Integer.toString(chunkSize), "--minDF", Integer.toString(minDF),
                "--maxDFPercent", Long.toString(maxDFPercent), "--minLLR", Integer.toString(minLLR),
                "--numReducers", Integer.toString(numReducers), "--norm", Integer.toString(norm),
                "--maxNGramSize", Integer.toString(maxNGramSize),
                sequentialAccessOutput ? "--sequentialAccessVector" : "",
                namedVectorOutput ? "--namedVector" : "", "--overwrite" };
        vectorizer.addWork(args);
        numThreads.incrementAndGet();
    } catch (Exception e) {
        System.out.println(e.getClass());
    } finally {
        while (numThreads.get() != 0) {
            Thread.sleep(1000);
        }
    }
}
From source file:com.collective.celos.ci.testing.fixtures.deploy.HdfsInputDeployer.java
License:Apache License
@Override
public void deploy(TestRun testRun) throws Exception {
    FileSystem fileSystem = testRun.getCiContext().getFileSystem();
    CollectFilesAndPathsProcessor pathToFile = new CollectFilesAndPathsProcessor();
    TreeObjectProcessor.process(fixObjectCreator.create(testRun), pathToFile);

    Path pathPrefixed = new Path(Util.augmentHdfsPath(testRun.getHdfsPrefix(), path.toString()));
    for (java.nio.file.Path childPath : pathToFile.pathToFiles.keySet()) {
        Path pathTo = new Path(pathPrefixed, childPath.toString());
        fileSystem.mkdirs(pathTo.getParent());

        FSDataOutputStream outputStream = fileSystem.create(pathTo);
        try {
            IOUtils.copy(pathToFile.pathToFiles.get(childPath).getContent(), outputStream);
        } finally {
            outputStream.flush();
            outputStream.close();
        }
    }
}
From source file:com.collective.celos.ci.testing.fixtures.deploy.hive.HiveTableDeployer.java
License:Apache License
private void loadDataToMockedTable(Statement statement, String mockedDatabase, Path dataFile, String tableName)
        throws SQLException, IOException {
    List<String> columnDef = Lists.newArrayList();
    List<String> partDef = Lists.newArrayList();
    parseTableDefinition(columnDef, partDef, mockedDatabase, statement);

    String createMockedTbl = String.format(CREATE_TEMP_TABLE_PATTERN, mockedDatabase, tableName,
            StringUtils.join(columnDef, ",\n"));
    statement.executeUpdate(createMockedTbl);

    String loadDataTmp = String.format(LOAD_TMP_DATA_PATTERN, dataFile.toString(), mockedDatabase, tableName);
    statement.executeUpdate(loadDataTmp);

    loadFromTempToRealDb(statement, tableName, partDef);
}
From source file:com.collective.celos.trigger.HDFSCheckTrigger.java
License:Apache License
private String humanReadableDescription(boolean ready, Path path) {
    if (ready) {
        return "HDFS path " + path.toString() + " is ready";
    } else {
        return "HDFS path " + path.toString() + " is not ready";
    }
}
From source file:com.conductor.hadoop.WritableValueInputFormat.java
License:Apache License
@VisibleForTesting
static <V extends Writable> void doSetupInput(final List<V> values, final Class<V> clazz,
        final int inputsPerSplit, final Job job, final Path inputPath, final Writer writer) throws IOException {
    job.getConfiguration().setClass(VALUE_TYPE_CONF, clazz, Writable.class);
    job.getConfiguration().setInt(INPUTS_PER_SPLIT_CONF, inputsPerSplit);
    job.getConfiguration().set(INPUT_FILE_LOCATION_CONF, inputPath.toString());

    // write each value to the sequence file
    int syncCounter = 0;
    for (final V input : values) {
        // each entry in the sequence file is a map input
        writer.append(NullWritable.get(), input);
        // syncing indicates an input split boundary
        if (++syncCounter % inputsPerSplit == 0) {
            writer.sync();
        }
    }

    // close the input file
    writer.hflush();
    writer.close();

    // delete file when JVM exits
    inputPath.getFileSystem(job.getConfiguration()).deleteOnExit(inputPath);
}
From source file:com.conductor.s3.S3InputFormatUtils.java
License:Apache License
/**
 * Efficiently gets the Hadoop {@link org.apache.hadoop.fs.FileStatus} for all S3 files under the provided
 * {@code dirs}.
 *
 * @param s3Client
 *            s3 client
 * @param blockSize
 *            the block size
 * @param dirs
 *            the dirs to search through
 * @return the {@link org.apache.hadoop.fs.FileStatus} version of all S3 files under {@code dirs}
 */
static List<FileStatus> getFileStatuses(final AmazonS3 s3Client, final long blockSize, final Path... dirs) {
    final List<FileStatus> result = Lists.newArrayList();
    for (final Path dir : dirs) {
        // get bucket and prefix from path
        final String bucket = S3HadoopUtils.getBucketFromPath(dir.toString());
        final String prefix = S3HadoopUtils.getKeyFromPath(dir.toString());

        // list request
        final ListObjectsRequest req = new ListObjectsRequest().withMaxKeys(Integer.MAX_VALUE)
                .withBucketName(bucket).withPrefix(prefix);

        // recursively page through all objects under the path
        for (ObjectListing listing = s3Client.listObjects(req); listing.getObjectSummaries().size() > 0;
                listing = s3Client.listNextBatchOfObjects(listing)) {
            for (final S3ObjectSummary summary : listing.getObjectSummaries()) {
                final Path path = new Path(
                        String.format("s3n://%s/%s", summary.getBucketName(), summary.getKey()));
                if (S3_PATH_FILTER.accept(path)) {
                    result.add(new FileStatus(summary.getSize(), false, 1, blockSize,
                            summary.getLastModified().getTime(), path));
                }
            }
            // don't need to check the next listing if this one is not truncated
            if (!listing.isTruncated()) {
                break;
            }
        }
    }
    return result;
}
From source file:com.continuent.tungsten.common.file.HdfsFileIO.java
License:Open Source License
/**
 * {@inheritDoc}
 *
 * @see com.continuent.tungsten.common.file.FileIO#write(com.continuent.tungsten.common.file.FilePath,
 *      java.lang.String, java.lang.String, boolean)
 */
@Override
public void write(FilePath path, String value, String charset, boolean fsync) throws FileIOException {
    // Write the data and flush to storage. This overwrites any previous version.
    Path p = new Path(path.toString());
    FSDataOutputStream os = null;
    try {
        os = (FSDataOutputStream) this.getOutputStream(path);
        BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(os, charset));
        bw.write(value);
        bw.flush();
        if (fsync) {
            os.hsync();
        }
    } catch (IOException e) {
        throw new FileIOException("Unable to write data to file: uri=" + uri + " path=" + p.toString()
                + " value=" + safeSynopsis(value, 20), e);
    } finally {
        if (os != null) {
            try {
                os.close();
            } catch (IOException e) {
            }
        }
    }
}
From source file:com.continuent.tungsten.common.file.HdfsFileIO.java
License:Open Source License
/**
 * {@inheritDoc}
 *
 * @see com.continuent.tungsten.common.file.FileIO#getOutputStream(com.continuent.tungsten.common.file.FilePath)
 */
public OutputStream getOutputStream(FilePath path) throws FileIOException {
    Path p = new Path(path.toString());
    try {
        return hdfs.create(p, true);
    } catch (IOException e) {
        throw new FileIOException("Unable to write data to file: uri=" + uri + " path=" + p.toString(), e);
    }
}
From source file:com.couchbase.sqoop.manager.CouchbaseManagerTest.java
License:Apache License
private void runCouchbaseTest(HashMap<String, String> expectedMap) throws IOException {
    Path warehousePath = new Path(this.getWarehouseDir());
    Path tablePath = new Path(warehousePath, TABLE_NAME);
    Path filePath = new Path(tablePath, "part-m-00000");

    File tableFile = new File(tablePath.toString());
    if (tableFile.exists() && tableFile.isDirectory()) {
        // remove the directory before running the import.
        FileListing.recursiveDeleteDir(tableFile);
    }

    String[] argv = getArgv();
    try {
        runImport(argv);
    } catch (IOException ioe) {
        LOG.error("Got IOException during import: " + ioe.toString());
        ioe.printStackTrace();
        fail(ioe.toString());
    }

    File f = new File(filePath.toString());
    assertTrue("Could not find imported data file", f.exists());

    BufferedReader r = null;
    try {
        // Read through the file and make sure it's all there.
        r = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
        String line;
        int records = 0;
        while ((line = r.readLine()) != null) {
            compareRecords(expectedMap, line);
            records++;
        }
        if (records < NUM_RECORDS) {
            fail("Not everything was imported. Got " + records + "/" + NUM_RECORDS + " records.");
        }
    } catch (IOException ioe) {
        LOG.error("Got IOException verifying results: " + ioe.toString());
        ioe.printStackTrace();
        fail(ioe.toString());
    } finally {
        IOUtils.closeStream(r);
    }
}
From source file:com.datasalt.pangool.examples.avro.AvroTweetsJoin.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        failArguments("Wrong number of arguments");
        return -1;
    }
    Path tweetsPath = new Path(args[0]);
    Path retweetsPath = new Path(args[1]);
    Path outputPath = new Path(args[2]);
    delete(outputPath.toString());

    TupleMRBuilder mr = new TupleMRBuilder(conf, "AvroTweetsJoin");
    mr.addIntermediateSchema(getPangoolTweetSchema());
    mr.addIntermediateSchema(getPangoolRetweetSchema());
    mr.setGroupByFields("tweet_id");
    mr.setOrderBy(new OrderBy().add("tweet_id", Order.ASC).addSchemaOrder(Order.ASC));
    mr.setSpecificOrderBy("retweet", new OrderBy().add("username", Order.ASC));

    mr.addInput(tweetsPath, new AvroInputFormat<Record>(getAvroTweetSchema()), new TweetsMapper());
    mr.addInput(retweetsPath, new HadoopInputFormat(TextInputFormat.class), new RetweetsMapper());
    mr.setOutput(outputPath, new AvroOutputFormat<Record>(getAvroOutputSchema()), AvroWrapper.class,
            NullWritable.class);
    mr.setTupleReducer(new Red());

    try {
        Job job = mr.createJob();
        job.waitForCompletion(true);
    } finally {
        mr.cleanUpInstanceFiles();
    }
    return 0;
}