List of usage examples for org.apache.hadoop.fs Path getParent
public Path getParent()
From source file:com.mongodb.hadoop.BSONFileOutputFormat.java
License:Apache License
@Override public RecordWriter<K, V> getRecordWriter(final TaskAttemptContext context) throws IOException { // Open data output stream Path outPath = getDefaultWorkFile(context, ".bson"); LOG.info("output going into " + outPath); FileSystem fs = outPath.getFileSystem(context.getConfiguration()); FSDataOutputStream outFile = fs.create(outPath); FSDataOutputStream splitFile = null; if (MongoConfigUtil.getBSONOutputBuildSplits(context.getConfiguration())) { Path splitPath = new Path(outPath.getParent(), "." + outPath.getName() + ".splits"); splitFile = fs.create(splitPath); }//ww w .ja v a 2 s . com long splitSize = BSONSplitter.getSplitSize(context.getConfiguration(), null); return new BSONFileRecordWriter<K, V>(outFile, splitFile, splitSize); }
From source file:com.mongodb.hadoop.hive.output.HiveBSONFileOutputFormat.java
License:Apache License
/** * create the final output file/*from w w w .j av a 2 s.com*/ * * @param jc the job configuration * @param fileOutputPath the file that the output should be directed at * @param valueClass the value class used to create * @param tableProperties the tableInfo for this file's corresponding table * @return RecordWriter for the output file */ @Override public RecordWriter getHiveRecordWriter(final JobConf jc, final Path fileOutputPath, final Class<? extends Writable> valueClass, final boolean isCompressed, final Properties tableProperties, final Progressable progress) throws IOException { LOG.info("Output going into " + fileOutputPath); FileSystem fs = fileOutputPath.getFileSystem(jc); FSDataOutputStream outFile = fs.create(fileOutputPath); FSDataOutputStream splitFile = null; if (MongoConfigUtil.getBSONOutputBuildSplits(jc)) { Path splitPath = new Path(fileOutputPath.getParent(), "." + fileOutputPath.getName() + ".splits"); splitFile = fs.create(splitPath); } long splitSize = BSONSplitter.getSplitSize(jc, null); return new HiveBSONFileRecordWriter(outFile, splitFile, splitSize); }
From source file:com.mongodb.hadoop.mapred.BSONFileOutputFormat.java
License:Apache License
public RecordWriter<K, V> getRecordWriter(final FileSystem ignored, final JobConf job, final String name, final Progressable progress) throws IOException { Path outPath = getDefaultWorkFile(job, name, ".bson"); LOG.info("output going into " + outPath); FileSystem fs = outPath.getFileSystem(job); FSDataOutputStream outFile = fs.create(outPath); FSDataOutputStream splitFile = null; if (MongoConfigUtil.getBSONOutputBuildSplits(job)) { Path splitPath = new Path(outPath.getParent(), "." + outPath.getName() + ".splits"); splitFile = fs.create(splitPath); }//from ww w .j av a 2s . c om long splitSize = BSONSplitter.getSplitSize(job, null); return new BSONFileRecordWriter<K, V>(outFile, splitFile, splitSize); }
From source file:com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java
License:Apache License
public Job setupJob(String jobName, Path outputFile, Class<? extends Mapper> mapperClass, Class<? extends Reducer> reducerClass, EntityId startKey, EntityId limitKey, FijiRowFilter filter) throws Exception { final Job job = new Job(createConfiguration()); final Configuration conf = job.getConfiguration(); // Get settings for test. final FijiDataRequest request = FijiDataRequest.builder() .addColumns(ColumnsDef.create().add("info", "name").add("info", "email")).build(); job.setJarByClass(IntegrationTestFijiTableInputFormat.class); // Setup the InputFormat. FijiTableInputFormat.configureJob(job, getFooTable().getURI(), request, startKey, limitKey, filter); job.setInputFormatClass(HBaseFijiTableInputFormat.class); // Duplicate functionality from MapReduceJobBuilder, since we are not using it here: final List<Path> jarFiles = Lists.newArrayList(); final FileSystem fs = FileSystem.getLocal(conf); for (String cpEntry : System.getProperty("java.class.path").split(":")) { if (cpEntry.endsWith(".jar")) { jarFiles.add(fs.makeQualified(new Path(cpEntry))); }//from ww w . j ava2 s.co m } DistributedCacheJars.addJarsToDistributedCache(job, jarFiles); // Create a test job. job.setJobName(jobName); // Setup the OutputFormat. TextOutputFormat.setOutputPath(job, outputFile.getParent()); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setOutputFormatClass(TextOutputFormat.class); // Set the mapper class. if (null != mapperClass) { job.setMapperClass(mapperClass); } // Set the reducer class. if (null != reducerClass) { job.setReducerClass(reducerClass); } return job; }
From source file:com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java
License:Apache License
/** Test FijiTableInputFormat in a map-only job. */ @Test/*from w w w . j a v a 2 s.c om*/ public void testMapJob() throws Exception { final Path outputFile = createOutputFile(); // Create a test job. final Job job = setupJob("testMapJob", outputFile, TestMapper.class, null, // reducer class null, // start key null, // limit key null); // filter // Run the job. assertTrue("Hadoop job failed", job.waitForCompletion(true)); // Check to make sure output exists. final FileSystem fs = FileSystem.get(job.getConfiguration()); assertTrue(fs.exists(outputFile.getParent())); // Verify that the output matches what's expected. final FSDataInputStream in = fs.open(outputFile); final Set<String> actual = Sets.newHashSet(IOUtils.toString(in).trim().split("\n")); final Set<String> expected = Sets.newHashSet("usermail.example.com\tAaron Kimball", "gmail.com\tJohn Doe", "usermail.example.com\tChristophe Bisciglia", "usermail.example.com\tKiyan Ahmadizadeh", "gmail.com\tJane Doe", "usermail.example.com\tGarrett Wu"); assertEquals("Result of job wasn't what was expected", expected, actual); // Clean up. fs.delete(outputFile.getParent(), true); IOUtils.closeQuietly(in); // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that // causes it to close other thread's filesystem objects. For more information // see: https://issues.apache.org/jira/browse/HADOOP-7973 }
From source file:com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java
License:Apache License
/** Test FijiTableInputFormat in a map-only job with start and limit keys. */ @Test// w ww .ja v a 2 s. c o m public void testMapJobWithStartAndLimitKeys() throws Exception { final Path outputFile = createOutputFile(); // Set the same entity IDs for start and limit, and we should get just the start row final EntityId startEntityId = getFooTable().getEntityId("jane.doe@gmail.com"); final byte[] endRowKey = startEntityId.getHBaseRowKey(); final EntityId rawLimitEntityId = HBaseEntityId .fromHBaseRowKey(Arrays.copyOf(endRowKey, endRowKey.length + 1)); // Create a test job. final Job job = setupJob("testMapJobWithStartAndLimitKeys", outputFile, TestMapper.class, null, // reducer class startEntityId, rawLimitEntityId, null); // filter // Run the job. assertTrue("Hadoop job failed", job.waitForCompletion(true)); // Check to make sure output exists. final FileSystem fs = FileSystem.get(job.getConfiguration()); assertTrue(fs.exists(outputFile.getParent())); // Verify that the output matches what's expected. final FSDataInputStream in = fs.open(outputFile); final Set<String> actual = Sets.newHashSet(IOUtils.toString(in).trim().split("\n")); final Set<String> expected = Sets.newHashSet("gmail.com\tJane Doe"); assertEquals("Result of job wasn't what was expected", expected, actual); // Clean up. fs.delete(outputFile.getParent(), true); IOUtils.closeQuietly(in); // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that // causes it to close other thread's filesystem objects. For more information // see: https://issues.apache.org/jira/browse/HADOOP-7973 }
From source file:com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java
License:Apache License
/** Test FijiTableInputFormat in a map-only job with a row filter. */ @Test//from w w w .j a va 2 s. c o m public void testMapJobWithFilter() throws Exception { final FijiRowFilter filter = new ColumnValueEqualsRowFilter("info", "email", new DecodedCell<String>(Schema.create(Schema.Type.STRING), "aaron@usermail.example.com")); final Path outputFile = createOutputFile(); // Create a test job. final Job job = setupJob("testMapJobWithFilter", outputFile, TestMapper.class, null, // reducer class null, // start key null, // limit key filter); // Run the job. assertTrue("Hadoop job failed", job.waitForCompletion(true)); // Check to make sure output exists. final FileSystem fs = FileSystem.get(job.getConfiguration()); assertTrue(fs.exists(outputFile.getParent())); // Verify that the output matches what's expected. final FSDataInputStream in = fs.open(outputFile); final Set<String> actual = Sets.newHashSet(IOUtils.toString(in).trim().split("\n")); final Set<String> expected = Sets.newHashSet("usermail.example.com\tAaron Kimball"); assertEquals("Result of job wasn't what was expected", expected, actual); // Clean up. fs.delete(outputFile.getParent(), true); IOUtils.closeQuietly(in); // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that // causes it to close other thread's filesystem objects. For more information // see: https://issues.apache.org/jira/browse/HADOOP-7973 }
From source file:com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java
License:Apache License
/** Test FijiTableInputFormat in a MapReduce job. */ @Test//from w w w . j av a 2 s .co m public void testMapReduceJob() throws Exception { final Path outputFile = createOutputFile(); // Create a test job. final Job job = setupJob("testMapReduceJob", outputFile, TestMapper.class, TestReducer.class, null, // start key null, // limit key null); // filter // Run the job. assertTrue("Hadoop job failed", job.waitForCompletion(true)); // Check to make sure output exists. final FileSystem fs = FileSystem.get(job.getConfiguration()); assertTrue(fs.exists(outputFile.getParent())); // Verify that the output matches what's expected. final FSDataInputStream in = fs.open(outputFile); final Set<String> output = Sets.newHashSet(IOUtils.toString(in).trim().split("\n")); final ImmutableMap.Builder<String, Set<String>> builder = ImmutableMap.builder(); for (String line : output) { final String[] keyValue = line.split("\t"); final String emailDomain = keyValue[0]; final Set<String> names = Sets.newHashSet(keyValue[1].split(",")); builder.put(emailDomain, names); } final Map<String, Set<String>> actual = builder.build(); final Map<String, Set<String>> expected = ImmutableMap.<String, Set<String>>builder() .put("usermail.example.com", Sets.newHashSet("Aaron Kimball", "Christophe Bisciglia", "Kiyan Ahmadizadeh", "Garrett Wu")) .put("gmail.com", Sets.newHashSet("John Doe", "Jane Doe")).build(); assertEquals("Result of job wasn't what was expected", expected, actual); // Clean up. fs.delete(outputFile.getParent(), true); IOUtils.closeQuietly(in); // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that // causes it to close other thread's filesystem objects. For more information // see: https://issues.apache.org/jira/browse/HADOOP-7973 }
From source file:com.moz.fiji.mapreduce.lib.examples.News20BulkImporter.java
License:Apache License
/** * Reads a single news article, and writes its contents to a new fiji row, * indexed by the article's name (A string consisting of the parent folder, and * this article's hash), and the a priori categorization of this article. * * @param key The fully qualified path to the current file we're reading. * @param value The raw data to insert into this column. * @param context The context to write to. * @throws IOException if there is an error. *//*from ww w . j a v a 2s .com*/ @Override public void produce(Text key, Text value, FijiTableContext context) throws IOException { Path qualifiedPath = new Path(key.toString()); // Category is specified on the containing folder. String category = qualifiedPath.getParent().getName(); // Name is the concatenation of category and file name. String name = category + "." + qualifiedPath.getName(); // write name, category, and raw article. EntityId entity = context.getEntityId(name); context.put(entity, FAMILY, ARTICLE_NAME_QUALIFIER, name); context.put(entity, FAMILY, CATEGORY_QUALIFIER, category); context.put(entity, FAMILY, RAW_ARTICLE_QUALIFIER, value.toString()); }
From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java
License:Apache License
/** * Opens a new part file.//from w w w. ja v a2s . c om * * <p> * This closes the old bucket file and retrieves a new bucket path from the {@code Bucketer}. */ private void openNewPartFile(Path bucketPath, BucketState<T> bucketState) throws Exception { closeCurrentPartFile(bucketState); FileSystem fs = new Path(basePath).getFileSystem(hadoopConf); if (!fs.exists(bucketPath)) { try { if (fs.mkdirs(bucketPath)) { LOG.debug("Created new bucket directory: {}", bucketPath); } } catch (IOException e) { throw new RuntimeException("Could not create new bucket path.", e); } } Path partPath = new Path(bucketPath, partPrefix + "-" + subtaskIndex + "-" + bucketState.partCounter + partSuffix); // This should work since there is only one parallel subtask that tries names with // our subtask id. Otherwise we would run into concurrency issues here. while (fs.exists(partPath) || fs .exists(new Path(partPath.getParent(), pendingPrefix + partPath.getName()).suffix(pendingSuffix))) { bucketState.partCounter++; partPath = new Path(bucketPath, partPrefix + "-" + subtaskIndex + "-" + bucketState.partCounter + partSuffix); } // increase, so we don't have to check for this name next time bucketState.partCounter++; LOG.debug("Next part path is {}", partPath.toString()); bucketState.currentFile = partPath.toString(); Path inProgressPath = new Path(partPath.getParent(), inProgressPrefix + partPath.getName()) .suffix(inProgressSuffix); // If we don't already have a writer for this bucket, create one if (bucketState.writer == null) { bucketState.writer = writerTemplate.duplicate(); } bucketState.writer.open(fs, inProgressPath); bucketState.isWriterOpen = true; }