List of usage examples for org.apache.hadoop.io SequenceFile createWriter
@Deprecated public static Writer createWriter(Configuration conf, FSDataOutputStream out, Class keyClass, Class valClass, CompressionType compressionType, CompressionCodec codec) throws IOException
From source file:org.apache.hama.ml.kmeans.KMeansBSP.java
License:Apache License
private void recalculateAssignmentsAndWrite( BSPPeer<VectorWritable, NullWritable, IntWritable, VectorWritable, CenterMessage> peer) throws IOException { final NullWritable value = NullWritable.get(); // also use our cache to speed up the final writes if exists if (cache == null) { final VectorWritable key = new VectorWritable(); IntWritable keyWrite = new IntWritable(); while (peer.readNext(key, value)) { final int lowestDistantCenter = getNearestCenter(key.getVector()); keyWrite.set(lowestDistantCenter); peer.write(keyWrite, key);//from w w w . ja va 2 s .c om } } else { IntWritable keyWrite = new IntWritable(); for (DoubleVector v : cache) { final int lowestDistantCenter = getNearestCenter(v); keyWrite.set(lowestDistantCenter); peer.write(keyWrite, new VectorWritable(v)); } } // just on the first task write the centers to filesystem to prevent // collisions if (peer.getPeerName().equals(peer.getPeerName(0))) { String pathString = conf.get(CENTER_OUT_PATH); if (pathString != null) { final SequenceFile.Writer dataWriter = SequenceFile.createWriter(FileSystem.get(conf), conf, new Path(pathString), VectorWritable.class, NullWritable.class, CompressionType.NONE); for (DoubleVector center : centers) { dataWriter.append(new VectorWritable(center), value); } dataWriter.close(); } } }
From source file:org.apache.hama.ml.kmeans.KMeansBSP.java
License:Apache License
/** * Reads input text files and writes it to a sequencefile. * /* www.ja va 2s. c o m*/ * @param k * @param conf * @param txtIn * @param center * @param out * @param fs * @param hasKey true if first column is required to be the key. * @return the path of a sequencefile. * @throws IOException */ public static Path prepareInputText(int k, Configuration conf, Path txtIn, Path center, Path out, FileSystem fs, boolean hasKey) throws IOException { Path in; if (fs.isFile(txtIn)) { in = new Path(txtIn.getParent(), "textinput/in.seq"); } else { in = new Path(txtIn, "textinput/in.seq"); } if (fs.exists(out)) fs.delete(out, true); if (fs.exists(center)) fs.delete(center, true); if (fs.exists(in)) fs.delete(in, true); final NullWritable value = NullWritable.get(); Writer centerWriter = new SequenceFile.Writer(fs, conf, center, VectorWritable.class, NullWritable.class); final SequenceFile.Writer dataWriter = SequenceFile.createWriter(fs, conf, in, VectorWritable.class, NullWritable.class, CompressionType.NONE); int i = 0; BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(txtIn))); String line; while ((line = br.readLine()) != null) { String[] split = line.split("\t"); int columnLength = split.length; int indexPos = 0; if (hasKey) { columnLength = columnLength - 1; indexPos++; } DenseDoubleVector vec = new DenseDoubleVector(columnLength); for (int j = 0; j < columnLength; j++) { vec.set(j, Double.parseDouble(split[j + indexPos])); } VectorWritable vector; if (hasKey) { NamedDoubleVector named = new NamedDoubleVector(split[0], vec); vector = new VectorWritable(named); } else { vector = new VectorWritable(vec); } dataWriter.append(vector, value); if (k > i) { centerWriter.append(vector, value); } i++; } br.close(); centerWriter.close(); dataWriter.close(); return in; }
From source file:org.apache.hama.ml.kmeans.KMeansBSP.java
License:Apache License
/**
 * Create some random vectors as input and assign the first k vectors as
 * initial centers. Existing {@code out}, {@code center} and {@code in} paths
 * are deleted recursively first.
 *
 * @param count the number of vectors to generate; every component is a
 *          random integer in {@code [0, count)}.
 * @param k the number of leading vectors that are also written as centers.
 * @param dimension the number of components of each vector.
 * @param conf the configuration used to create the sequence file writers.
 * @param in the path of the input sequence file to (re)create.
 * @param center the path of the center sequence file to (re)create.
 * @param out an output path that is deleted if it exists.
 * @param fs the file system holding all of the paths.
 * @throws IOException if any file system access fails.
 */
public static void prepareInput(int count, int k, int dimension, Configuration conf, Path in,
    Path center, Path out, FileSystem fs) throws IOException {
  if (fs.exists(out)) {
    fs.delete(out, true);
  }
  if (fs.exists(center)) {
    fs.delete(center, true);
  }
  if (fs.exists(in)) {
    fs.delete(in, true);
  }

  final NullWritable value = NullWritable.get();
  final SequenceFile.Writer centerWriter = SequenceFile.createWriter(fs, conf, center,
      VectorWritable.class, NullWritable.class, CompressionType.NONE);
  try {
    final SequenceFile.Writer dataWriter = SequenceFile.createWriter(fs, conf, in,
        VectorWritable.class, NullWritable.class, CompressionType.NONE);
    try {
      Random r = new Random();
      for (int i = 0; i < count; i++) {
        double[] arr = new double[dimension];
        for (int d = 0; d < dimension; d++) {
          arr[d] = r.nextInt(count);
        }
        VectorWritable vector = new VectorWritable(new DenseDoubleVector(arr));
        dataWriter.append(vector, value);
        if (k > i) {
          centerWriter.append(vector, value);
        }
      }
    } finally {
      // close the data file even when an append fails
      dataWriter.close();
    }
  } finally {
    // close the center file even when the data writer could not be created
    centerWriter.close();
  }
}
From source file:org.apache.mahout.math.hadoop.stochasticsvd.qr.QRFirstStep.java
License:Apache License
/**
 * Returns the writer for the temporary Q output, creating it on first use.
 * On creation the writer and a delete-on-close handle for its file are both
 * pushed to the front of {@code closeables}.
 *
 * @return the temporary Q writer.
 * @throws IOException if the local file system or the writer cannot be
 *           obtained.
 */
private SequenceFile.Writer getTempQw() throws IOException {
  if (tempQw != null) {
    return tempQw;
  }

  /*
   * temporary Q output hopefully will not exceed size of IO cache in which
   * case it is only good since it is going to be managed by kernel, not
   * java GC. And if IO cache is not good enough, then at least it is always
   * sequential.
   */
  final String tmpDirName = System.getProperty("java.io.tmpdir");
  final FileSystem local = FileSystem.getLocal(jobConf);
  final Path tmpRoot = new Path(tmpDirName);
  final Path uniqueDir = new Path(tmpRoot, "qw_" + System.currentTimeMillis());

  tempQPath = new Path(uniqueDir, "q-temp.seq");
  tempQw = SequenceFile.createWriter(local, jobConf, tempQPath, IntWritable.class,
      DenseBlockWritable.class, CompressionType.BLOCK);

  closeables.addFirst(tempQw);
  final File tempQFile = new File(tempQPath.toString());
  closeables.addFirst(new IOUtils.DeleteFileOnClose(tempQFile));

  return tempQw;
}
From source file:org.apache.mrql.Bag.java
License:Apache License
/** add a new value to a Bag (cache it in memory if necessary) * @param x the new value// w ww. j a v a 2 s . c o m */ public void add(final MRData x) { materialize(); if (!spilled() && Config.hadoop_mode && size() >= Config.max_materialized_bag) spill(); if (spilled()) try { if (writer == null) { // writer was closed earlier for reading FileSystem fs = FileSystem.getLocal(Plan.conf); writer = SequenceFile.createWriter(fs, Plan.conf, new Path(path), MRContainer.class, NullWritable.class, SequenceFile.CompressionType.NONE); System.err.println("*** Appending elements to a spilled Bag: " + path); } ; writer.append(new MRContainer(x), NullWritable.get()); } catch (IOException e) { throw new Error("Cannot append an element to a spilled Bag: " + path); } else content.add(x); }
From source file:org.apache.mrql.Bag.java
License:Apache License
/** spill the Bag to a local file */ private void spill() { if (!spilled() && Config.hadoop_mode) try {//w w w .j a v a 2 s . c o m if (Plan.conf == null) Plan.conf = Evaluator.evaluator.new_configuration(); final FileSystem fs = FileSystem.getLocal(Plan.conf); path = new_path(fs); System.err.println("*** Spilling a Bag to a local file: " + path); final Path p = new Path(path); writer = SequenceFile.createWriter(fs, Plan.conf, new Path(path), MRContainer.class, NullWritable.class, SequenceFile.CompressionType.NONE); for (MRData e : this) writer.append(new MRContainer(e), NullWritable.get()); mode = Modes.SPILLED; content = null; iterator = null; } catch (Exception e) { throw new Error("Cannot spill a Bag to a local file"); } }
From source file:org.apache.mrql.Plan.java
License:Apache License
/** splits the range min..max into multiple ranges, one for each mapper */ public final static DataSet generator(int source_num, long min, long max, long split_length) throws Exception { if (min > max) throw new Error("Wrong range: " + min + "..." + max); if (split_length < 1) if (Config.bsp_mode) split_length = (max - min) / Config.nodes + 1; else// ww w .j a va 2 s . com split_length = Config.range_split_size; DataSet ds = new DataSet(0, 0); long i = min; while (i + split_length <= max) { String file = new_path(conf); Path path = new Path(file); SequenceFile.Writer writer = SequenceFile.createWriter(path.getFileSystem(conf), conf, path, MRContainer.class, MRContainer.class, SequenceFile.CompressionType.NONE); writer.append(new MRContainer(new MR_long(i)), new MRContainer(new Tuple(new MR_long(i), new MR_long(split_length)))); writer.close(); ds.source.add(new GeneratorDataSource(source_num, file, conf)); i += split_length; } ; if (i <= max) { String file = new_path(conf); Path path = new Path(file); SequenceFile.Writer writer = SequenceFile.createWriter(path.getFileSystem(conf), conf, path, MRContainer.class, MRContainer.class, SequenceFile.CompressionType.NONE); writer.append(new MRContainer(new MR_long(i)), new MRContainer(new Tuple(new MR_long(i), new MR_long(max - i + 1)))); writer.close(); ds.source.add(new GeneratorDataSource(source_num, file, conf)); } ; return ds; }
From source file:org.apache.tez.dag.history.logging.proto.ProtoMessageWriter.java
License:Apache License
ProtoMessageWriter(Configuration conf, Path filePath, Parser<T> parser) throws IOException { this.filePath = filePath; this.writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(filePath), SequenceFile.Writer.keyClass(NullWritable.class), SequenceFile.Writer.valueClass(ProtoMessageWritable.class), SequenceFile.Writer.appendIfExists(true), SequenceFile.Writer.compression(CompressionType.RECORD)); this.writable = new ProtoMessageWritable<>(parser); }
From source file:org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter.java
License:Open Source License
public SequenceFileSpillWriter(FileSystem fileSystem, Configuration conf, Path outputFilePath, Class<KeyType> keyClass, Class<ValueType> valueClass, SequenceFileIndexWriter<KeyType, ValueType> optionalIndexWriter, boolean compress) throws IOException { _indexWriter = optionalIndexWriter;// w ww. ja v a 2 s . c om _spillBufferSize = conf.getInt(SPILL_WRITER_BUFFER_SIZE_PARAM, DEFAULT_SPILL_BUFFER_SIZE); _outputStream = fileSystem.create(outputFilePath); // allocate buffer ... _activeBuffer = ByteBuffer.allocate(_spillBufferSize); if (compress) { Class codecClass = conf.getClass("mapred.output.compression.codec", DefaultCodec.class); CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf); writer = SequenceFile.createWriter(conf, _outputStream, keyClass, valueClass, CompressionType.BLOCK, codec); } else { writer = SequenceFile.createWriter(conf, _outputStream, keyClass, valueClass, CompressionType.NONE, null); } _writerThread = new Thread(new Runnable() { @Override public void run() { // LOG.info("Writer Thread Starting"); while (true) { QueuedBufferItem queuedBufferItem = null; try { queuedBufferItem = _bufferQueue.take(); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } if (queuedBufferItem._buffer == null) { // LOG.info("Writer Thread received empty buffer item. Exiting"); return; } else { ByteBuffer theBuffer = queuedBufferItem._buffer; // LOG.info("Writer Thread received item. 
Limit:" + // theBuffer.limit()); // get byte pointer byte[] bufferAsBytes = theBuffer.array(); int itemsWritten = 0; long timeStart = System.currentTimeMillis(); while (theBuffer.remaining() != 0) { // now read in key length int keyLen = theBuffer.getInt(); // mark key position int keyPos = theBuffer.position(); // now skip past key length theBuffer.position(keyPos + keyLen); // read value length int valueLen = theBuffer.getInt(); // mark value position int valuePosition = theBuffer.position(); // now skip past it ... theBuffer.position(valuePosition + valueLen); // now write this out to the sequence file ... try { spillRawRecord2(bufferAsBytes, keyPos, keyLen, bufferAsBytes, valuePosition, valueLen); } catch (IOException e) { LOG.error("Writer Thread Failed with Error:" + StringUtils.stringifyException(e)); _writerException = e; return; } itemsWritten++; } // LOG.info("Writer Thread Finished With Buffer. Wrote:"+ // itemsWritten + " in:" + (System.currentTimeMillis() - // timeStart)); } } } }); _writerThread.start(); }
From source file:org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java
License:Open Source License
@Override public void configure(JobConf job) { HashSet<Integer> onlyDoPartitions = null; String hack = job.get("hack"); if (hack != null) { onlyDoPartitions = new HashSet<Integer>(); JsonParser parser = new JsonParser(); JsonArray hackArray = parser.parse(hack).getAsJsonArray(); for (JsonElement element : hackArray) { onlyDoPartitions.add(element.getAsInt()); }//www.j a v a2s .c o m } _conf = job; try { _fs = FileSystem.get(_conf); int partitionId = _conf.getInt("mapred.task.partition", 0); if (onlyDoPartitions == null || onlyDoPartitions.contains(partitionId)) { Path redirectPath = new Path(FileOutputFormat.getWorkOutputPath(_conf), "redirect-" + NUMBER_FORMAT.format(partitionId)); _redirectWriter = SequenceFile.createWriter(_fs, _conf, redirectPath, TextBytes.class, TextBytes.class, CompressionType.BLOCK); } else { _skipPartition = true; } } catch (IOException e) { e.printStackTrace(); } }