List of usage examples for org.apache.hadoop.io SequenceFile createWriter
@Deprecated public static Writer createWriter(Configuration conf, FSDataOutputStream out, Class keyClass, Class valClass, CompressionType compressionType, CompressionCodec codec) throws IOException
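Before the per-file examples below, here is a minimal, self-contained sketch of how a SequenceFile writer is typically created with the newer Option-based createWriter that replaces the deprecated overloads shown in these examples. The class name, output path, and key/value types here are placeholders chosen for illustration, not taken from any example on this page.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileWriteSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path file = new Path("/tmp/example.seq"); // hypothetical output path
        // Option-based replacement for the deprecated (fs, conf, path, ...) overloads used below
        SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(file),
                SequenceFile.Writer.keyClass(LongWritable.class),
                SequenceFile.Writer.valueClass(Text.class),
                SequenceFile.Writer.compression(SequenceFile.CompressionType.NONE));
        try {
            // append one key/value record, as the examples below do in their loops
            writer.append(new LongWritable(1L), new Text("value"));
        } finally {
            writer.close();
        }
    }
}

The append-in-try / close-in-finally pattern mirrors what most of the examples below do with the older overloads.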
From source file:com.sequenceiq.yarntest.mr.QuasiMonteCarlo.java
License:Apache License
/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the ID of the submitted job
 */
public static JobID submitPiEstimationMRApp(String jobName, int numMaps, long numPoints, Path tmpDir,
        Configuration conf) throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(conf);
    // setup job conf
    job.setJobName(jobName);
    job.setJarByClass(QuasiMonteCarlo.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setOutputKeyClass(BooleanWritable.class);
    job.setOutputValueClass(LongWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(QmcMapper.class);

    job.setReducerClass(QmcReducer.class);
    job.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    job.setSpeculativeExecution(false);

    // setup input/output directories
    final Path inDir = new Path(tmpDir, "in");
    final Path outDir = new Path(tmpDir, "out");
    FileInputFormat.setInputPaths(job, inDir);
    FileOutputFormat.setOutputPath(job, outDir);

    final FileSystem fs = FileSystem.get(conf);
    if (fs.exists(tmpDir)) {
        fs.delete(tmpDir, true);
        // throw new IOException("Tmp directory " + fs.makeQualified(tmpDir)
        //     + " already exists. Please remove it first.");
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Cannot create input directory " + inDir);
    }

    // try {
    // generate an input file for each map task
    for (int i = 0; i < numMaps; ++i) {
        final Path file = new Path(inDir, "part" + i);
        final LongWritable offset = new LongWritable(i * numPoints);
        final LongWritable size = new LongWritable(numPoints);
        final SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, LongWritable.class,
                LongWritable.class, CompressionType.NONE);
        try {
            writer.append(offset, size);
        } finally {
            writer.close();
        }
        System.out.println("Wrote input for Map #" + i);
    }

    // start a map/reduce job
    System.out.println("Starting Job");
    final long startTime = System.currentTimeMillis();
    job.submit();
    // final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
    // System.out.println("Job Finished in " + duration + " seconds");
    return job.getJobID();
    // } finally {
    //     fs.delete(tmpDir, true);
    // }
}
From source file:com.test.PiEstimatorKrb.java
License:Apache License
/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static BigDecimal estimate(int numMaps, long numPoints, JobConf jobConf) throws IOException {
    // setup job conf
    jobConf.setJobName(PiEstimatorKrb.class.getSimpleName());

    jobConf.setInputFormat(SequenceFileInputFormat.class);

    jobConf.setOutputKeyClass(BooleanWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    jobConf.setMapperClass(PiMapper.class);
    jobConf.setNumMapTasks(numMaps);

    jobConf.setReducerClass(PiReducer.class);
    jobConf.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    jobConf.setSpeculativeExecution(false);

    // setup input/output directories
    final Path inDir = new Path(TMP_DIR, "in");
    final Path outDir = new Path(TMP_DIR, "out");
    FileInputFormat.setInputPaths(jobConf, inDir);
    FileOutputFormat.setOutputPath(jobConf, outDir);

    final FileSystem fs = FileSystem.get(jobConf);
    if (fs.exists(TMP_DIR)) {
        throw new IOException(
                "Tmp directory " + fs.makeQualified(TMP_DIR) + " already exists. Please remove it first.");
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Cannot create input directory " + inDir);
    }

    try {
        // generate an input file for each map task
        for (int i = 0; i < numMaps; ++i) {
            final Path file = new Path(inDir, "part" + i);
            final LongWritable offset = new LongWritable(i * numPoints);
            final LongWritable size = new LongWritable(numPoints);
            final SequenceFile.Writer writer = SequenceFile.createWriter(fs, jobConf, file, LongWritable.class,
                    LongWritable.class, CompressionType.NONE);
            try {
                writer.append(offset, size);
            } finally {
                writer.close();
            }
            sLogger.info("Wrote input for Map #" + i);
        }

        // start a map/reduce job
        sLogger.info("Starting Job");
        final long startTime = System.currentTimeMillis();

        if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
            jobConf.set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
        }

        JobClient.runJob(jobConf);
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        sLogger.info("Job Finished in " + duration + " seconds");

        // read outputs
        Path inFile = new Path(outDir, "reduce-out");
        LongWritable numInside = new LongWritable();
        LongWritable numOutside = new LongWritable();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf);
        try {
            reader.next(numInside, numOutside);
        } finally {
            reader.close();
        }

        // compute estimated value
        return BigDecimal.valueOf(4).setScale(20).multiply(BigDecimal.valueOf(numInside.get()))
                .divide(BigDecimal.valueOf(numMaps)).divide(BigDecimal.valueOf(numPoints));
    } finally {
        fs.delete(TMP_DIR, true);
    }
}
From source file:edu.berkeley.chukwa_xtrace.TestXtrExtract.java
License:Apache License
public void writeASinkFile(Configuration conf, FileSystem fileSys, Path dest, int chunks) throws IOException {
    FSDataOutputStream out = fileSys.create(dest);

    SequenceFile.Writer seqFileWriter = SequenceFile.createWriter(conf, out, ChukwaArchiveKey.class,
            ChunkImpl.class, SequenceFile.CompressionType.NONE, null);
    // FIXME: do write here

    seqFileWriter.close();
    out.close();
}
From source file:edu.brown.cs.mapreduce.generator.DataLoader.java
License:Open Source License
/**
 * @param args
 */
public static void main(String[] args) {
    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; i++) {
        if ("-compress".equals(args[i])) {
            DataLoader.compress = true;
            DataLoader.sequence = true;
        } else if ("-sequence".equals(args[i])) {
            DataLoader.sequence = true;
        } else if ("-tuple".equals(args[i])) {
            DataLoader.tuple = true;
        } else if ("-local".equals(args[i])) {
            DataLoader.local = true;
        } else if ("-limit".equals(args[i])) {
            DataLoader.limit = Integer.parseInt(args[++i]);
        } else if ("-xargs".equals(args[i])) {
            DataLoader.xargs = true;
        } else if ("-debug".equals(args[i])) {
            DataLoader.debug = true;
        } else {
            otherArgs.add(args[i]);
        }
    } // FOR

    if (otherArgs.size() < 3 && !DataLoader.xargs) {
        System.err.println("USAGE: DataLoader <input type> <input file> <output file>");
        System.exit(1);
    }

    String input_type = otherArgs.get(0).toLowerCase();
    String input_file = otherArgs.get(1);
    String output_file = null;
    if (DataLoader.xargs) {
        output_file = input_file + ".dl";
    } else {
        output_file = otherArgs.get(2);
    }

    boolean valid = false;
    for (String type : DataLoader.VALID_TYPES) {
        if (type.equals(input_type)) {
            valid = true;
            break;
        }
    }
    if (!valid) {
        System.err.println("ERROR: Invalid input data type '" + input_type + "'");
        System.exit(1);
    }

    if (debug) {
        System.out.println("Input Type: " + input_type);
        System.out.println("Input File: " + input_file);
        System.out.println("Output File: " + output_file);
        System.out.println("Limit: " + DataLoader.limit);
        System.out.println("Local: " + DataLoader.local);
        System.out.println("XArgs: " + DataLoader.xargs);
    }

    //
    // Get HDFS filesystem object that we can use for writing
    //
    FileSystem fs = null;
    Configuration conf = null;
    if (!DataLoader.local) {
        conf = AbstractHadoopClient.getConfiguration();
        try {
            fs = FileSystem.get(conf);
        } catch (Exception ex) {
            ex.printStackTrace();
            System.exit(-1);
        }
        if (debug)
            System.out.println("fs.default.name: " + conf.get("fs.default.name"));
    }

    //
    // Now open the file that we want to read and start writing the contents to our file system
    // For some things, like 'urls' we will want reverse the order so that the data makes sense
    // in our key->value paradigm
    //
    BufferedReader in = null;
    DataOutputStream out = null;
    SequenceFile.Writer writer = null;
    int lines = 0;

    try {
        if (input_file.equals("-")) {
            in = new BufferedReader(new InputStreamReader(System.in));
        } else {
            in = new BufferedReader(new FileReader(input_file));
        }
    } catch (FileNotFoundException ex) {
        System.err.println("ERROR: The input file '" + input_file + "' was not found : " + ex.getMessage());
        System.exit(1);
    }

    try {
        if (!DataLoader.local) {
            //
            // FileSystem Writer
            //
            if (!DataLoader.sequence) {
                out = fs.create(new Path(output_file));
            //
            // SequenceFile Writer
            //
            } else {
                if (input_type.equals("sortgrep"))
                    DataLoader.tuple = false;
                if (DataLoader.debug)
                    System.out.print("Creating " + (DataLoader.compress ? "compressed " : "")
                            + "SequenceFile.Writer for '" + output_file + "': ");
                Class<? extends Writable> key_class = Text.class;
                Class<? extends Writable> value_class = null;
                if (DataLoader.tuple) {
                    if (input_type.equals("uservisits"))
                        value_class = UserVisitsTuple.class;
                    if (input_type.equals("rankings"))
                        value_class = RankingsTuple.class;
                } else {
                    value_class = Text.class;
                }
                writer = SequenceFile.createWriter(fs, conf, new Path(output_file), key_class, value_class,
                        (DataLoader.compress ? SequenceFile.CompressionType.BLOCK
                                : SequenceFile.CompressionType.NONE));
                if (DataLoader.debug)
                    System.out.println("DONE!");
            }
        //
        // Local Filesystem
        //
        } else {
            out = new DataOutputStream(new FileOutputStream(output_file, true));
        }
    } catch (IOException ex) {
        System.err.println("ERROR: Failed to open output file '" + output_file + "' : " + ex.getMessage());
        System.exit(1);
    }

    try {
        //
        // Now read in each line of the input file and append it to our output
        //
        while (in.ready()) {
            //
            // Ignore any misformated lines
            //
            String line = null;
            String key = "";
            String value = "";
            try {
                line = in.readLine();
                String data[] = line.split("\\" + BenchmarkBase.VALUE_DELIMITER);
                //
                // Switch the two values in a rankings record
                //
                if (input_type.equals("rankings")) {
                    key = data[1];
                    value = data[0];
                    for (int i = 2; i < data.length; i++) {
                        value += BenchmarkBase.VALUE_DELIMITER + data[i];
                    } // FOR
                //
                // Change the comma to a tab
                //
                } else if (input_type.equals("convert") || input_type.equals("uservisits")) {
                    key = data[0];
                    for (int i = 1; i < data.length; i++) {
                        if (i != 1)
                            value += BenchmarkBase.VALUE_DELIMITER;
                        value += data[i];
                    } // FOR
                //
                // Don't do anything with the SortGrep data!
                //
                } else if (input_type.equals("sortgrep")) {
                    key = line.substring(0, 10);
                    value = line.substring(10);
                //
                // All others need to switch the first VALUE_DELIMITER to a KEYVALUE_DELIMITER
                //
                } else {
                    line = line.replaceFirst(BenchmarkBase.VALUE_DELIMITER, BenchmarkBase.KEYVALUE_DELIMITER);
                }
                if (DataLoader.local || !DataLoader.sequence) {
                    line = key + BenchmarkBase.KEYVALUE_DELIMITER + value + "\n";
                    out.write(line.getBytes());
                } else {
                    //if (DataLoader.debug) System.out.println("[" + lines + "] " + key + " => " + value);
                    if (DataLoader.tuple) {
                        try {
                            data = value.split("\\" + BenchmarkBase.VALUE_DELIMITER);
                            Writable tuple_values[] = new Writable[data.length];
                            Class<?> types[] = (input_type.equals("uservisits") ? BenchmarkBase.USERVISITS_TYPES
                                    : BenchmarkBase.RANKINGS_TYPES);
                            for (int ctr = 0; ctr < data.length; ctr++) {
                                //
                                // Important! You have to subtract one from the types list
                                // because the first one is really the key, but we're creating a tuple
                                // on just the values!!
                                //
                                if (types[ctr + 1] == Text.class) {
                                    tuple_values[ctr] = new Text(data[ctr]);
                                } else if (types[ctr + 1] == IntWritable.class) {
                                    tuple_values[ctr] = new IntWritable(Integer.valueOf(data[ctr]));
                                } else if (types[ctr + 1] == DoubleWritable.class) {
                                    tuple_values[ctr] = new DoubleWritable(Double.valueOf(data[ctr]));
                                } else if (types[ctr + 1] == LongWritable.class) {
                                    tuple_values[ctr] = new LongWritable(Long.valueOf(data[ctr]));
                                } else if (types[ctr + 1] == FloatWritable.class) {
                                    tuple_values[ctr] = new FloatWritable(Float.valueOf(data[ctr]));
                                } else {
                                    System.err.println("Unsupported Class: " + types[ctr + 1]);
                                    System.exit(1);
                                }
                                if (DataLoader.debug)
                                    System.out.println("tuple_values[" + ctr + "] = " + tuple_values[ctr]);
                            }
                            AbstractTuple tuple = (input_type.equals("uservisits")
                                    ? new UserVisitsTuple(tuple_values)
                                    : new RankingsTuple(tuple_values));
                            if (DataLoader.debug)
                                System.out.println("STORING TUPLE: " + tuple + " (DATA " + data + " | VALUE "
                                        + value + ")");
                            writer.append(new Text(key), tuple);
                        } catch (Exception ex) {
                            ex.printStackTrace();
                            System.err.println("Error[" + output_file + "]");
                            System.err.println("## Line: " + lines);
                            System.err.println("## Content: " + line);
                        }
                    } else {
                        writer.append(new Text(key), new Text(value));
                    }
                }
                lines++;
                if (DataLoader.limit != null && lines >= DataLoader.limit)
                    break;
                if (DataLoader.debug && lines % 1000000 == 0)
                    System.out.println(
                            "\tWrote " + lines + " '" + input_type + "' records to '" + output_file + "'");
            } catch (Exception ex) {
                System.err.println("Error[" + output_file + "]");
                System.err.println("## Line: " + lines);
                System.err.println("## Content: " + line);
                ex.printStackTrace();
                System.exit(1);
            }
        } // WHILE
    } catch (Exception ex) {
        ex.printStackTrace();
        System.exit(1);
    } finally {
        try {
            if (in != null)
                in.close();
            if (out != null)
                out.close();
            if (writer != null)
                writer.close();
        } catch (Exception ex) {
            ex.printStackTrace();
            System.exit(1);
        }
    }
    System.out.println("Wrote " + lines + " '" + input_type + "' records to '" + output_file + "'");
}
From source file:edu.indiana.soic.ts.mapreduce.pwd.PairWiseDistance.java
License:Open Source License
private void distributeData(int blockSize, Configuration conf, FileSystem fs, Path hdInputDir, int noOfDivisions)
        throws IOException {
    // Writing block meta data for each block in a separate file so that
    // Hadoop will create separate Map tasks for each block..
    // Key : block number
    // Value: row#column#isDiagonal#base_file_name
    // TODO : find a better way to do this.
    for (int row = 0; row < noOfDivisions; row++) {
        for (int column = 0; column < noOfDivisions; column++) {
            // using the load balancing algorithm to select the blocks
            // include the diagonal blocks as they are blocks, not
            // individual pairs
            if (((row >= column) & ((row + column) % 2 == 0)) | ((row <= column) & ((row + column) % 2 == 1))) {
                Path vFile = new Path(hdInputDir, "data_file_" + row + "_" + column);
                SequenceFile.Writer vWriter = SequenceFile.createWriter(fs, conf, vFile, LongWritable.class,
                        Text.class, CompressionType.NONE);

                boolean isDiagonal = false;
                if (row == column) {
                    isDiagonal = true;
                }
                String value = row + Constants.BREAK + column + Constants.BREAK + isDiagonal + Constants.BREAK
                        + Constants.HDFS_SEQ_FILENAME;
                vWriter.append(new LongWritable(row * blockSize + column), new Text(value));
                vWriter.close();
            }
        }
    }
}
From source file:edu.ucsb.cs.hybrid.io.Splitter.java
License:Apache License
/**
 * Checks input files and picks one with the requested S_size.
 * @param job : job configuration.
 * @param inputPath: path to contain the one map file.
 * @param othersPath: other path that contains the whole input.
 * @param S_size: s vectors put into one map file.
 */
public static void createOneMapFile(JobConf job, Path inputPath, Path othersPath, long S_size)
        throws IOException {
    FileStatus[] files = hdfs.listStatus(othersPath);
    for (int i = 0; i < files.length; i++) {
        if (Collector.countFileVectors(hdfs, files[i].getPath(), job) >= S_size) {
            SequenceFile.Reader reader = new SequenceFile.Reader(hdfs, files[i].getPath(), job);
            SequenceFile.Writer writer = SequenceFile.createWriter(hdfs, job,
                    new Path(inputPath.getName() + "/" + files[i].getPath().getName()), LongWritable.class,
                    FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);
            long vCount = -1;
            while (reader.next(key, value) && (++vCount) < S_size)
                writer.append(key, value);
            writer.close();
            return;
        }
    }
    throw new UnsupportedEncodingException("S_size requested is larger than each file !");
}
From source file:edu.ucsb.cs.hybrid.io.Splitter.java
License:Apache License
/**
 * Splits the files in the input directory into at most s vectors
 * each. It does not combine the vectors from two different partitions.
 * @param job : configurations.
 * @param S_size : split files into at most this size of vectors.
 * @param inputPath : path of the directory of the input files.
 * @return path of the splitted files with each at most s vectors.
 */
public static Path splitAll(JobConf job, long S_size, Path inputPath) throws IOException {
    System.out.println("Splitter.splitAll() from " + inputPath.getName()
            + " into partitions of size at most " + S_size);
    LongWritable key = new LongWritable();
    FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
    SequenceFile.Writer writer = null;
    String tmpDir = "splits-tmp";
    hdfs.delete(new Path(tmpDir), true);
    hdfs.mkdirs(new Path(tmpDir));

    FileStatus[] files = Partitioner.setFiles(hdfs, inputPath);
    for (int i = 0; i < files.length; i++) {
        if ((hdfs.isDirectory(files[i].getPath()) || files[i].getPath().getName().startsWith("_")))
            continue;
        SequenceFile.Reader reader = new SequenceFile.Reader(hdfs, files[i].getPath(), job);
        long subpartition = 0, vecCount = 0;
        while (reader.next(key, value)) {
            vecCount++;
            if (vecCount == 1) {
                if (writer != null)
                    writer.close();
                subpartition++;
                writer = SequenceFile.createWriter(hdfs, job,
                        new Path(tmpDir + "/" + files[i].getPath().getName() + "-" + subpartition),
                        LongWritable.class, FeatureWeightArrayWritable.class,
                        SequenceFile.CompressionType.NONE);
            }
            writer.append(key, value);
            if (vecCount == S_size)
                vecCount = 0;
        }
    }
    writer.close();
    return new Path(tmpDir);
}
From source file:edu.ucsb.cs.lsh.minhash.MinHashLshDriver.java
License:Apache License
public static void writeLsh(JobConf job, FileSystem fs, LshTable lshTable) {
    try {
        Path lshfile = new Path("lshfile");
        NullWritable none = NullWritable.get();
        if (fs.exists(lshfile))
            fs.delete(lshfile);
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, lshfile, LshTable.class,
                NullWritable.class, SequenceFile.CompressionType.NONE);
        writer.append(lshTable, none);
        writer.close();
        DistributedCache.addCacheFile(new URI("lshfile"), job);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:edu.ucsb.cs.lsh.statistics.LshStat.java
License:Apache License
public static void convertInput(String[] args) throws IOException {
    if (args.length != 3)
        printUsage(2);
    String strLine, input = args[1], output_file = args[2];
    Path outPath = new Path(output_file);
    Configuration conf = new Configuration();
    FileSystem fs = outPath.getFileSystem(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outPath, DocDocWritable.class,
            FloatWritable.class, SequenceFile.CompressionType.NONE);

    if ((new File(input)).isDirectory()) {
        for (File inputFile : (new File(input)).listFiles()) {
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(new DataInputStream(new FileInputStream(inputFile))));
            while ((strLine = br.readLine()) != null) {
                writer.append(new DocDocWritable(0, 3), new FloatWritable(1));
            }
        }
    } else {
    }
    writer.close();
}
From source file:edu.ucsb.cs.lsh.statistics.LshStat.java
License:Apache License
public static void produceMaxBucket(String args[]) throws IOException {
    if (args.length == 3)
        maxBucketID = Integer.parseInt(args[2]);
    else if (args.length != 2)
        printUsage(4);

    Path inputPath = new Path(args[1]);
    Path outPath = new Path("maxBucket");
    Configuration conf = new Configuration();
    FileSystem fs = inputPath.getFileSystem(conf);
    if (fs.exists(outPath))
        fs.delete(outPath);

    FileStatus[] files = fs.listStatus(inputPath);
    SequenceFile.Writer writer = null;
    int bucketCount = 0;

    for (FileStatus file : files) {
        if ((fs.isDirectory(file.getPath())) || file.getPath().getName().startsWith("_"))
            continue;
        Reader reader = new SequenceFile.Reader(fs, file.getPath(), conf);
        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
        while (reader.next(key, value))
            if (key.get() == 0) {
                bucketCount++;
                if (bucketCount == maxBucketID) {
                    writer = SequenceFile.createWriter(fs, conf, outPath, LongWritable.class,
                            FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);
                    while (reader.next(key, value) && (key.get() != 0))
                        writer.append(key, value);
                    writer.close();
                    return;
                }
            }
    }
}