List of usage examples for org.apache.hadoop.io.NullWritable.get()
public static NullWritable get()
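Before the examples from real projects below, here is a minimal standalone sketch (not taken from any of the sources listed; the file name example.seq is hypothetical). NullWritable is a zero-length singleton Writable, and get() returns that shared instance; it is typically used as a placeholder key or value when only one side of a key/value pair carries data, as in a SequenceFile keyed by nothing:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class NullWritableExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("example.seq"); // hypothetical path for illustration

    // Write a SequenceFile whose keys are NullWritable placeholders.
    SequenceFile.Writer writer = SequenceFile.createWriter(path.getFileSystem(conf), conf, path,
        NullWritable.class, Text.class);
    writer.append(NullWritable.get(), new Text("value with no meaningful key"));
    writer.close();

    // Read it back, reusing the same singleton as the key buffer.
    SequenceFile.Reader reader = new SequenceFile.Reader(path.getFileSystem(conf), path, conf);
    NullWritable key = NullWritable.get();
    Text value = new Text();
    while (reader.next(key, value)) {
      System.out.println(value);
    }
    reader.close();
  }
}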
From source file:org.apache.hama.pipes.util.SequenceFileDumper.java
License:Apache License
public static void main(String[] args) throws Exception {
  CommandLineParser cli = new CommandLineParser();
  if (args.length == 0) {
    cli.printUsage();
    return;
  }

  // Add arguments
  cli.addOption("file", false, "The Sequence File containing the Clusters", "path");
  cli.addOption("output", false, "The output file. If not specified, dumps to the console", "path");
  cli.addOption("substring", false, "The number of chars of the FormatString() to print", "number");
  cli.addOption("count", false, "Report the count only", "number");

  Parser parser = cli.createParser();
  try {
    HamaConfiguration conf = new HamaConfiguration();
    CommandLine cmdLine = parser.parse(cli.options, args);
    if (cmdLine.hasOption("file")) {
      Path path = new Path(cmdLine.getOptionValue("file"));
      FileSystem fs = FileSystem.get(path.toUri(), conf);
      if (!fs.isFile(path)) {
        System.out.println("File does not exist: " + path.toString());
        return;
      }
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);

      Writer writer;
      if (cmdLine.hasOption("output")) {
        writer = new FileWriter(cmdLine.getOptionValue("output"));
      } else {
        writer = new OutputStreamWriter(System.out);
      }
      writer.append("Input Path: ").append(String.valueOf(path)).append(LINE_SEP);

      int sub = Integer.MAX_VALUE;
      if (cmdLine.hasOption("substring")) {
        sub = Integer.parseInt(cmdLine.getOptionValue("substring"));
      }

      Writable key;
      if (reader.getKeyClass() != NullWritable.class) {
        key = (Writable) reader.getKeyClass().newInstance();
      } else {
        key = NullWritable.get();
      }
      Writable value;
      if (reader.getValueClass() != NullWritable.class) {
        value = (Writable) reader.getValueClass().newInstance();
      } else {
        value = NullWritable.get();
      }

      writer.append("Key class: ").append(String.valueOf(reader.getKeyClass())).append(" Value Class: ")
          .append(String.valueOf(value.getClass())).append(LINE_SEP);
      writer.flush();

      long count = 0;
      boolean countOnly = cmdLine.hasOption("count");
      if (countOnly == false) {
        while (reader.next(key, value)) {
          writer.append("Key: ").append(String.valueOf(key));
          String str = value.toString();
          writer.append(": Value: ").append(str.length() > sub ? str.substring(0, sub) : str);
          writer.write(LINE_SEP);
          writer.flush();
          count++;
        }
        writer.append("Count: ").append(String.valueOf(count)).append(LINE_SEP);
      } else {
        // count only
        while (reader.next(key, value)) {
          count++;
        }
        writer.append("Count: ").append(String.valueOf(count)).append(LINE_SEP);
      }
      writer.flush();

      if (cmdLine.hasOption("output")) {
        writer.close();
      }
      reader.close();
    } else {
      cli.printUsage();
    }
  } catch (ParseException e) {
    LOG.error(e.getMessage());
    cli.printUsage();
    return;
  }
}
From source file:org.apache.hawq.pxf.plugins.hdfs.AvroFileAccessor.java
License:Apache License
/**
 * readNextObject
 * The AVRO accessor is currently the only specialized accessor that
 * overrides this method. This happens because of the special
 * AvroRecordReader.next() semantics (use of the AvroWrapper), so it
 * cannot use the RecordReader's default implementation in
 * SplittableFileAccessor.
 */
@Override
public OneRow readNextObject() throws IOException {
  // Reset the datum to null, to avoid stale bytes being padded from the previous row's datum.
  avroWrapper.datum(null);
  if (reader.next(avroWrapper, NullWritable.get())) {
    // There is one more record in the current split.
    return new OneRow(null, avroWrapper.datum());
  } else if (getNextSplit()) {
    // The current split is exhausted; try to move to the next split.
    return reader.next(avroWrapper, NullWritable.get()) ? new OneRow(null, avroWrapper.datum()) : null;
  }
  // If neither condition was met, we already read all the records in all the splits, and
  // in this call the record variable was not set, so we return null to signal the end of the
  // record sequence - in this case avroWrapper.datum() will be null.
  return null;
}
From source file:org.apache.hcatalog.mapreduce.FileRecordWriterContainer.java
License:Apache License
@Override
public void write(WritableComparable<?> key, HCatRecord value) throws IOException, InterruptedException {
  org.apache.hadoop.mapred.RecordWriter localWriter;
  ObjectInspector localObjectInspector;
  SerDe localSerDe;
  OutputJobInfo localJobInfo = null;

  if (dynamicPartitioningUsed) {
    // calculate which writer to use from the remaining values - this needs to be done before we delete cols
    List<String> dynamicPartValues = new ArrayList<String>();
    for (Integer colToAppend : dynamicPartCols) {
      dynamicPartValues.add(value.get(colToAppend).toString());
    }

    String dynKey = dynamicPartValues.toString();
    if (!baseDynamicWriters.containsKey(dynKey)) {
      if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
        throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
            "Number of dynamic partitions being created "
                + "exceeds configured max allowable partitions[" + maxDynamicPartitions
                + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
                + "] if needed.");
      }

      org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil
          .createTaskAttemptContext(context);
      configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
      localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext);

      // setup serDe
      SerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(),
          currTaskContext.getJobConf());
      try {
        InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
      } catch (SerDeException e) {
        throw new IOException("Failed to initialize SerDe", e);
      }

      // create base OutputFormat
      org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils
          .newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());

      // We are skipping calling checkOutputSpecs() for each partition
      // as it can throw a FileAlreadyExistsException when more than one mapper is writing to a partition.
      // See HCATALOG-490; this also avoids contacting the namenode for each new FileOutputFormat instance.
      // In general this should be ok for most FileOutputFormat implementations
      // but may become an issue for cases when the method is used to perform other setup tasks.

      // get Output Committer
      org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf()
          .getOutputCommitter();
      // create currJobContext the latest so it gets all the config changes
      org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil.createJobContext(currTaskContext);
      // setupJob()
      baseOutputCommitter.setupJob(currJobContext);
      // recreate to refresh jobConf of currTask context
      currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(),
          currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());
      // set temp location
      currTaskContext.getConfiguration().set("mapred.work.output.dir",
          new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath()
              .toString());
      // setupTask()
      baseOutputCommitter.setupTask(currTaskContext);

      Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
      Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext, "part", ""));

      org.apache.hadoop.mapred.RecordWriter baseRecordWriter = baseOF.getRecordWriter(
          parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(),
          childPath.toString(), InternalUtil.createReporter(currTaskContext));

      baseDynamicWriters.put(dynKey, baseRecordWriter);
      baseDynamicSerDe.put(dynKey, currSerDe);
      baseDynamicCommitters.put(dynKey, baseOutputCommitter);
      dynamicContexts.put(dynKey, currTaskContext);
      dynamicObjectInspectors.put(dynKey, InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
      dynamicOutputJobInfo.put(dynKey, HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey)));
    }

    localJobInfo = dynamicOutputJobInfo.get(dynKey);
    localWriter = baseDynamicWriters.get(dynKey);
    localSerDe = baseDynamicSerDe.get(dynKey);
    localObjectInspector = dynamicObjectInspectors.get(dynKey);
  } else {
    localJobInfo = jobInfo;
    localWriter = getBaseRecordWriter();
    localSerDe = serDe;
    localObjectInspector = objectInspector;
  }

  for (Integer colToDel : partColsToDel) {
    value.remove(colToDel);
  }

  // The key given by user is ignored
  try {
    localWriter.write(NullWritable.get(), localSerDe.serialize(value.getAll(), localObjectInspector));
  } catch (SerDeException e) {
    throw new IOException("Failed to serialize object", e);
  }
}
From source file:org.apache.hcatalog.mapreduce.TestHCatHiveThriftCompatibility.java
License:Apache License
@Before
@Override
public void setUp() throws Exception {
  super.setUp();
  if (setUpComplete) {
    return;
  }

  ByteArrayOutputStream out = new ByteArrayOutputStream();
  TIOStreamTransport transport = new TIOStreamTransport(out);
  TBinaryProtocol protocol = new TBinaryProtocol(transport);

  IntString intString = new IntString(1, "one", 1);
  intString.write(protocol);
  BytesWritable bytesWritable = new BytesWritable(out.toByteArray());

  intStringSeq = new Path(TEST_DATA_DIR + "/data/intString.seq");
  LOG.info("Creating data file: " + intStringSeq);

  SequenceFile.Writer seqFileWriter = SequenceFile.createWriter(intStringSeq.getFileSystem(hiveConf),
      hiveConf, intStringSeq, NullWritable.class, BytesWritable.class);
  seqFileWriter.append(NullWritable.get(), bytesWritable);
  seqFileWriter.close();

  setUpComplete = true;
}
From source file:org.apache.hcatalog.mapreduce.TestHCatInputFormat.java
License:Apache License
/**
 * Create an input sequence file with 100 records; every 10th record is bad.
 * Load this table into Hive.
 */
@Before
@Override
public void setUp() throws Exception {
  super.setUp();
  if (setUpComplete) {
    return;
  }

  Path intStringSeq = new Path(TEST_DATA_DIR + "/data/intString.seq");
  LOG.info("Creating data file: " + intStringSeq);

  SequenceFile.Writer seqFileWriter = SequenceFile.createWriter(intStringSeq.getFileSystem(hiveConf),
      hiveConf, intStringSeq, NullWritable.class, BytesWritable.class);

  ByteArrayOutputStream out = new ByteArrayOutputStream();
  TIOStreamTransport transport = new TIOStreamTransport(out);
  TBinaryProtocol protocol = new TBinaryProtocol(transport);

  for (int i = 1; i <= 100; i++) {
    if (i % 10 == 0) {
      seqFileWriter.append(NullWritable.get(), new BytesWritable("bad record".getBytes()));
    } else {
      out.reset();
      IntString intString = new IntString(i, Integer.toString(i), i);
      intString.write(protocol);
      BytesWritable bytesWritable = new BytesWritable(out.toByteArray());
      seqFileWriter.append(NullWritable.get(), bytesWritable);
    }
  }

  seqFileWriter.close();

  // Now let's load this file into a new Hive table.
  Assert.assertEquals(0, driver.run("drop table if exists test_bad_records").getResponseCode());
  Assert.assertEquals(0, driver
      .run("create table test_bad_records "
          + "row format serde 'org.apache.hadoop.hive.serde2.thrift.ThriftDeserializer' "
          + "with serdeproperties ( "
          + "  'serialization.class'='org.apache.hadoop.hive.serde2.thrift.test.IntString', "
          + "  'serialization.format'='org.apache.thrift.protocol.TBinaryProtocol') "
          + "stored as"
          + " inputformat 'org.apache.hadoop.mapred.SequenceFileInputFormat'"
          + " outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'")
      .getResponseCode());
  Assert.assertEquals(0,
      driver.run("load data local inpath '" + intStringSeq.getParent() + "' into table test_bad_records")
          .getResponseCode());

  setUpComplete = true;
}
From source file:org.apache.hive.hcatalog.mapreduce.FileRecordWriterContainer.java
License:Apache License
@Override
public void write(WritableComparable<?> key, HCatRecord value) throws IOException, InterruptedException {
  LocalFileWriter localFileWriter = getLocalFileWriter(value);
  RecordWriter localWriter = localFileWriter.getLocalWriter();
  ObjectInspector localObjectInspector = localFileWriter.getLocalObjectInspector();
  SerDe localSerDe = localFileWriter.getLocalSerDe();
  OutputJobInfo localJobInfo = localFileWriter.getLocalJobInfo();

  for (Integer colToDel : partColsToDel) {
    value.remove(colToDel);
  }

  // The key given by user is ignored
  try {
    localWriter.write(NullWritable.get(), localSerDe.serialize(value.getAll(), localObjectInspector));
  } catch (SerDeException e) {
    throw new IOException("Failed to serialize object", e);
  }
}
From source file:org.apache.hive.jdbc.BaseJdbcWithMiniLlap.java
License:Apache License
private int processQuery(String currentDatabase, String query, int numSplits, RowProcessor rowProcessor)
    throws Exception {
  String url = miniHS2.getJdbcURL();
  String user = System.getProperty("user.name");
  String pwd = user;
  String handleId = UUID.randomUUID().toString();

  InputFormat<NullWritable, Row> inputFormat = getInputFormat();

  // Get splits
  JobConf job = new JobConf(conf);
  job.set(LlapBaseInputFormat.URL_KEY, url);
  job.set(LlapBaseInputFormat.USER_KEY, user);
  job.set(LlapBaseInputFormat.PWD_KEY, pwd);
  job.set(LlapBaseInputFormat.QUERY_KEY, query);
  job.set(LlapBaseInputFormat.HANDLE_ID, handleId);
  if (currentDatabase != null) {
    job.set(LlapBaseInputFormat.DB_KEY, currentDatabase);
  }

  InputSplit[] splits = inputFormat.getSplits(job, numSplits);
  assertTrue(splits.length > 0);

  // Fetch rows from splits
  boolean first = true;
  int rowCount = 0;
  for (InputSplit split : splits) {
    System.out.println("Processing split " + split.getLocations());

    int numColumns = 2;
    RecordReader<NullWritable, Row> reader = inputFormat.getRecordReader(split, job, null);
    Row row = reader.createValue();
    while (reader.next(NullWritable.get(), row)) {
      rowProcessor.process(row);
      ++rowCount;
    }
    // In arrow-mode this will throw an exception unless all buffers have been released.
    // See org.apache.hadoop.hive.llap.LlapArrowBatchRecordReader
    reader.close();
  }
  LlapBaseInputFormat.close(handleId);

  return rowCount;
}
From source file:org.apache.hive.jdbc.TestJdbcWithMiniLlap.java
License:Apache License
private int processQuery(String query, int numSplits, RowProcessor rowProcessor) throws Exception {
  String url = miniHS2.getJdbcURL();
  String user = System.getProperty("user.name");
  String pwd = user;
  LlapRowInputFormat inputFormat = new LlapRowInputFormat();

  // Get splits
  JobConf job = new JobConf(conf);
  job.set(LlapBaseInputFormat.URL_KEY, url);
  job.set(LlapBaseInputFormat.USER_KEY, user);
  job.set(LlapBaseInputFormat.PWD_KEY, pwd);
  job.set(LlapBaseInputFormat.QUERY_KEY, query);

  InputSplit[] splits = inputFormat.getSplits(job, numSplits);
  assertTrue(splits.length > 0);

  // Fetch rows from splits
  boolean first = true;
  int rowCount = 0;
  for (InputSplit split : splits) {
    System.out.println("Processing split " + split.getLocations());

    int numColumns = 2;
    RecordReader<NullWritable, Row> reader = inputFormat.getRecordReader(split, job, null);
    Row row = reader.createValue();
    while (reader.next(NullWritable.get(), row)) {
      rowProcessor.process(row);
      ++rowCount;
    }
  }

  return rowCount;
}
From source file:org.apache.hive.storage.jdbc.JdbcRecordReader.java
License:Apache License
@Override
public boolean next(LongWritable key, MapWritable value) throws IOException {
  try {
    LOGGER.debug("JdbcRecordReader.next called");
    if (dbAccessor == null) {
      dbAccessor = DatabaseAccessorFactory.getAccessor(conf);
      iterator = dbAccessor.getRecordIterator(conf, split.getLimit(), split.getOffset());
    }

    if (iterator.hasNext()) {
      LOGGER.debug("JdbcRecordReader has more records to read.");
      key.set(pos);
      pos++;
      Map<String, Object> record = iterator.next();
      if ((record != null) && (!record.isEmpty())) {
        for (Entry<String, Object> entry : record.entrySet()) {
          value.put(new Text(entry.getKey()),
              entry.getValue() == null ? NullWritable.get() : new ObjectWritable(entry.getValue()));
        }
        return true;
      } else {
        LOGGER.debug("JdbcRecordReader got null record.");
        return false;
      }
    } else {
      LOGGER.debug("JdbcRecordReader has no more records to read.");
      return false;
    }
  } catch (Exception e) {
    LOGGER.error("An error occurred while reading the next record from DB.", e);
    return false;
  }
}
From source file:org.apache.jena.grande.pig.RdfStorage.java
License:Apache License
@Override
public void putNext(Tuple tuple) throws IOException {
  log.debug("putNext({})", tuple);
  try {
    Node g = NodeEncoder.asNode((String) tuple.get(0));
    Node s = NodeEncoder.asNode((String) tuple.get(1));
    Node p = NodeEncoder.asNode((String) tuple.get(2));
    Node o = NodeEncoder.asNode((String) tuple.get(3));
    Quad quad = new Quad(g, s, p, o);
    QuadWritable quadWritable = new QuadWritable(quad);
    writer.write(NullWritable.get(), quadWritable);
  } catch (InterruptedException e) {
    throw new IOException(e);
  }
}