Example usage for org.apache.hadoop.io SequenceFile createWriter

Introduction

On this page you can find usage examples for org.apache.hadoop.io.SequenceFile#createWriter, collected from open-source projects.

Prototype

@Deprecated
public static Writer createWriter(Configuration conf, FSDataOutputStream out, Class keyClass, Class valClass,
        CompressionType compressionType, CompressionCodec codec) throws IOException 

Document

Construct the preferred type of 'raw' SequenceFile Writer.
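
A minimal sketch of this overload (the /tmp path and the Text/LongWritable record are illustrative, and imports from org.apache.hadoop.fs and org.apache.hadoop.io are assumed): the caller opens the FSDataOutputStream itself, and a null codec is acceptable when CompressionType.NONE is used, as in the Chukwa example further down.

public static void writeExample(Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    Path file = new Path("/tmp/example.seq"); // hypothetical output path
    FSDataOutputStream out = fs.create(file);
    // NONE needs no codec, so null is passed for the codec argument
    SequenceFile.Writer writer = SequenceFile.createWriter(conf, out, Text.class, LongWritable.class,
            SequenceFile.CompressionType.NONE, null);
    try {
        writer.append(new Text("key"), new LongWritable(42L));
    } finally {
        writer.close();
        out.close(); // the supplied stream is closed explicitly, as in the Chukwa example below
    }
}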

Usage

From source file: com.sequenceiq.yarntest.mr.QuasiMonteCarlo.java

License: Apache License

/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the JobID of the submitted job
 */
public static JobID submitPiEstimationMRApp(String jobName, int numMaps, long numPoints, Path tmpDir,
        Configuration conf) throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(conf);
    //setup job conf
    job.setJobName(jobName);
    job.setJarByClass(QuasiMonteCarlo.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setOutputKeyClass(BooleanWritable.class);
    job.setOutputValueClass(LongWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(QmcMapper.class);

    job.setReducerClass(QmcReducer.class);
    job.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    job.setSpeculativeExecution(false);

    //setup input/output directories
    final Path inDir = new Path(tmpDir, "in");
    final Path outDir = new Path(tmpDir, "out");
    FileInputFormat.setInputPaths(job, inDir);
    FileOutputFormat.setOutputPath(job, outDir);

    final FileSystem fs = FileSystem.get(conf);
    if (fs.exists(tmpDir)) {
        fs.delete(tmpDir, true);
        //      throw new IOException("Tmp directory " + fs.makeQualified(tmpDir)
        //          + " already exists.  Please remove it first.");
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Cannot create input directory " + inDir);
    }

    //  try {
    //generate an input file for each map task
    for (int i = 0; i < numMaps; ++i) {
        final Path file = new Path(inDir, "part" + i);
        final LongWritable offset = new LongWritable(i * numPoints);
        final LongWritable size = new LongWritable(numPoints);
        final SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, LongWritable.class,
                LongWritable.class, CompressionType.NONE);
        try {
            writer.append(offset, size);
        } finally {
            writer.close();
        }
        System.out.println("Wrote input for Map #" + i);
    }

    //start a map/reduce job
    System.out.println("Starting Job");
    final long startTime = System.currentTimeMillis();
    job.submit();
    //      final double duration = (System.currentTimeMillis() - startTime)/1000.0;
    //      System.out.println("Job Finished in " + duration + " seconds");
    return job.getJobID();

    //    } finally {
    //      fs.delete(tmpDir, true);
    //    }
}
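
The factory used above, createWriter(fs, conf, file, keyClass, valClass, compressionType), is deprecated in current Hadoop releases. As a sketch only (not part of the original example), the same per-map input record could be written with the option-based overload, reusing the conf, file, offset, and size variables from the loop above:

SequenceFile.Writer writer = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(file),
        SequenceFile.Writer.keyClass(LongWritable.class),
        SequenceFile.Writer.valueClass(LongWritable.class),
        SequenceFile.Writer.compression(SequenceFile.CompressionType.NONE));
try {
    writer.append(offset, size); // same (offset, size) record as above
} finally {
    writer.close();
}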

From source file: com.test.PiEstimatorKrb.java

License: Apache License

/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static BigDecimal estimate(int numMaps, long numPoints, JobConf jobConf) throws IOException {
    //setup job conf
    jobConf.setJobName(PiEstimatorKrb.class.getSimpleName());

    jobConf.setInputFormat(SequenceFileInputFormat.class);

    jobConf.setOutputKeyClass(BooleanWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    jobConf.setMapperClass(PiMapper.class);
    jobConf.setNumMapTasks(numMaps);

    jobConf.setReducerClass(PiReducer.class);
    jobConf.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    jobConf.setSpeculativeExecution(false);

    //setup input/output directories
    final Path inDir = new Path(TMP_DIR, "in");
    final Path outDir = new Path(TMP_DIR, "out");
    FileInputFormat.setInputPaths(jobConf, inDir);
    FileOutputFormat.setOutputPath(jobConf, outDir);

    final FileSystem fs = FileSystem.get(jobConf);
    if (fs.exists(TMP_DIR)) {
        throw new IOException(
                "Tmp directory " + fs.makeQualified(TMP_DIR) + " already exists.  Please remove it first.");
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Cannot create input directory " + inDir);
    }

    try {
        //generate an input file for each map task
        for (int i = 0; i < numMaps; ++i) {
            final Path file = new Path(inDir, "part" + i);
            final LongWritable offset = new LongWritable(i * numPoints);
            final LongWritable size = new LongWritable(numPoints);
            final SequenceFile.Writer writer = SequenceFile.createWriter(fs, jobConf, file, LongWritable.class,
                    LongWritable.class, CompressionType.NONE);
            try {
                writer.append(offset, size);
            } finally {
                writer.close();
            }
            sLogger.info("Wrote input for Map #" + i);
        }

        //start a map/reduce job
        sLogger.info("Starting Job");
        final long startTime = System.currentTimeMillis();

        if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
            jobConf.set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
        }

        JobClient.runJob(jobConf);
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        sLogger.info("Job Finished in " + duration + " seconds");

        //read outputs
        Path inFile = new Path(outDir, "reduce-out");
        LongWritable numInside = new LongWritable();
        LongWritable numOutside = new LongWritable();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf);
        try {
            reader.next(numInside, numOutside);
        } finally {
            reader.close();
        }

        //compute estimated value
        return BigDecimal.valueOf(4).setScale(20).multiply(BigDecimal.valueOf(numInside.get()))
                .divide(BigDecimal.valueOf(numMaps)).divide(BigDecimal.valueOf(numPoints));
    } finally {
        fs.delete(TMP_DIR, true);
    }
}
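
SequenceFile.Writer and SequenceFile.Reader both implement java.io.Closeable, so on Java 7 or later the manual try/finally blocks above can be condensed with try-with-resources. A sketch of the reader section under that assumption, reusing fs, inFile, and jobConf from the example:

LongWritable numInside = new LongWritable();
LongWritable numOutside = new LongWritable();
try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf)) {
    reader.next(numInside, numOutside); // close() is called automatically
}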

From source file: edu.berkeley.chukwa_xtrace.TestXtrExtract.java

License: Apache License

public void writeASinkFile(Configuration conf, FileSystem fileSys, Path dest, int chunks) throws IOException {
    FSDataOutputStream out = fileSys.create(dest);

    SequenceFile.Writer seqFileWriter = SequenceFile.createWriter(conf, out, ChukwaArchiveKey.class,
            ChunkImpl.class, SequenceFile.CompressionType.NONE, null);

    //FIXME: do write here

    seqFileWriter.close();
    out.close();
}

From source file: edu.brown.cs.mapreduce.generator.DataLoader.java

License: Open Source License

/**
 * @param args
 */
public static void main(String[] args) {
    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; i++) {
        if ("-compress".equals(args[i])) {
            DataLoader.compress = true;
            DataLoader.sequence = true;
        } else if ("-sequence".equals(args[i])) {
            DataLoader.sequence = true;
        } else if ("-tuple".equals(args[i])) {
            DataLoader.tuple = true;
        } else if ("-local".equals(args[i])) {
            DataLoader.local = true;
        } else if ("-limit".equals(args[i])) {
            DataLoader.limit = Integer.parseInt(args[++i]);
        } else if ("-xargs".equals(args[i])) {
            DataLoader.xargs = true;
        } else if ("-debug".equals(args[i])) {
            DataLoader.debug = true;
        } else {
            otherArgs.add(args[i]);
        }
    } // FOR

    if (otherArgs.size() < 3 && !DataLoader.xargs) {
        System.err.println("USAGE: DataLoader <input type> <input file> <output file>");
        System.exit(1);
    }

    String input_type = otherArgs.get(0).toLowerCase();
    String input_file = otherArgs.get(1);
    String output_file = null;
    if (DataLoader.xargs) {
        output_file = input_file + ".dl";
    } else {
        output_file = otherArgs.get(2);
    }

    boolean valid = false;
    for (String type : DataLoader.VALID_TYPES) {
        if (type.equals(input_type)) {
            valid = true;
            break;
        }
    }
    if (!valid) {
        System.err.println("ERROR: Invalid input data type '" + input_type + "'");
        System.exit(1);
    }

    if (debug) {
        System.out.println("Input Type:  " + input_type);
        System.out.println("Input File:  " + input_file);
        System.out.println("Output File: " + output_file);
        System.out.println("Limit:       " + DataLoader.limit);
        System.out.println("Local:       " + DataLoader.local);
        System.out.println("XArgs:       " + DataLoader.xargs);
    }

    //
    // Get HDFS filesystem object that we can use for writing
    //
    FileSystem fs = null;
    Configuration conf = null;
    if (!DataLoader.local) {
        conf = AbstractHadoopClient.getConfiguration();
        try {
            fs = FileSystem.get(conf);
        } catch (Exception ex) {
            ex.printStackTrace();
            System.exit(-1);
        }
        if (debug)
            System.out.println("fs.default.name: " + conf.get("fs.default.name"));
    }

    //
    // Now open the file that we want to read and start writing the contents to our file system
    // For some things, like 'urls' we will want reverse the order so that the data makes sense
    // in our key->value paradigm
    //
    BufferedReader in = null;
    DataOutputStream out = null;
    SequenceFile.Writer writer = null;
    int lines = 0;
    try {
        if (input_file.equals("-")) {
            in = new BufferedReader(new InputStreamReader(System.in));
        } else {
            in = new BufferedReader(new FileReader(input_file));
        }
    } catch (FileNotFoundException ex) {
        System.err.println("ERROR: The input file '" + input_file + "' was not found : " + ex.getMessage());
        System.exit(1);
    }
    try {
        if (!DataLoader.local) {
            //
            // FileSystem Writer
            //
            if (!DataLoader.sequence) {
                out = fs.create(new Path(output_file));
                //
                // SequenceFile Writer
                //
            } else {
                if (input_type.equals("sortgrep"))
                    DataLoader.tuple = false;
                if (DataLoader.debug)
                    System.out.print("Creating " + (DataLoader.compress ? "compressed " : "")
                            + "SequenceFile.Writer for '" + output_file + "': ");
                Class<? extends Writable> key_class = Text.class;
                Class<? extends Writable> value_class = null;
                if (DataLoader.tuple) {
                    if (input_type.equals("uservisits"))
                        value_class = UserVisitsTuple.class;
                    if (input_type.equals("rankings"))
                        value_class = RankingsTuple.class;
                } else {
                    value_class = Text.class;
                }
                writer = SequenceFile.createWriter(fs, conf, new Path(output_file), key_class, value_class,
                        (DataLoader.compress ? SequenceFile.CompressionType.BLOCK
                                : SequenceFile.CompressionType.NONE));
                if (DataLoader.debug)
                    System.out.println("DONE!");
            }
            //
            // Local Filesystem
            //
        } else {
            out = new DataOutputStream(new FileOutputStream(output_file, true));
        }
    } catch (IOException ex) {
        System.err.println("ERROR: Failed to open output file '" + output_file + "' : " + ex.getMessage());
        System.exit(1);
    }
    try {
        //
        // Now read in each line of the input file and append it to our output
        //
        while (in.ready()) {
            //
            // Ignore any misformatted lines
            //
            String line = null;
            String key = "";
            String value = "";
            try {
                line = in.readLine();
                String data[] = line.split("\\" + BenchmarkBase.VALUE_DELIMITER);
                //
                // Switch the two values in a rankings record
                //
                if (input_type.equals("rankings")) {
                    key = data[1];
                    value = data[0];
                    for (int i = 2; i < data.length; i++) {
                        value += BenchmarkBase.VALUE_DELIMITER + data[i];
                    } // FOR
                    //
                    // Change the comma to a tab
                    //
                } else if (input_type.equals("convert") || input_type.equals("uservisits")) {
                    key = data[0];
                    for (int i = 1; i < data.length; i++) {
                        if (i != 1)
                            value += BenchmarkBase.VALUE_DELIMITER;
                        value += data[i];
                    } // FOR
                    //
                    // Don't do anything with the SortGrep data!
                    //
                } else if (input_type.equals("sortgrep")) {
                    key = line.substring(0, 10);
                    value = line.substring(10);
                    //
                    // All others need to switch the first VALUE_DELIMITER to a KEYVALUE_DELIMITER
                    //   
                } else {
                    line = line.replaceFirst(BenchmarkBase.VALUE_DELIMITER, BenchmarkBase.KEYVALUE_DELIMITER);
                }
                if (DataLoader.local || !DataLoader.sequence) {
                    line = key + BenchmarkBase.KEYVALUE_DELIMITER + value + "\n";
                    out.write(line.getBytes());
                } else {
                    //if (DataLoader.debug) System.out.println("[" + lines + "] " + key + " => " + value);
                    if (DataLoader.tuple) {
                        try {
                            data = value.split("\\" + BenchmarkBase.VALUE_DELIMITER);
                            Writable tuple_values[] = new Writable[data.length];
                            Class<?> types[] = (input_type.equals("uservisits") ? BenchmarkBase.USERVISITS_TYPES
                                    : BenchmarkBase.RANKINGS_TYPES);
                            for (int ctr = 0; ctr < data.length; ctr++) {
                                //
                                // Important! You have to subtract one from the types list
                                // because the first one is really the key, but we're creating a tuple
                                // on just the values!!
                                //
                                if (types[ctr + 1] == Text.class) {
                                    tuple_values[ctr] = new Text(data[ctr]);
                                } else if (types[ctr + 1] == IntWritable.class) {
                                    tuple_values[ctr] = new IntWritable(Integer.valueOf(data[ctr]));
                                } else if (types[ctr + 1] == DoubleWritable.class) {
                                    tuple_values[ctr] = new DoubleWritable(Double.valueOf(data[ctr]));
                                } else if (types[ctr + 1] == LongWritable.class) {
                                    tuple_values[ctr] = new LongWritable(Long.valueOf(data[ctr]));
                                } else if (types[ctr + 1] == FloatWritable.class) {
                                    tuple_values[ctr] = new FloatWritable(Float.valueOf(data[ctr]));
                                } else {
                                    System.err.println("Unsupported Class: " + types[ctr + 1]);
                                    System.exit(1);
                                }
                                if (DataLoader.debug)
                                    System.out.println("tuple_values[" + ctr + "] = " + tuple_values[ctr]);
                            }
                            AbstractTuple tuple = (input_type.equals("uservisits")
                                    ? new UserVisitsTuple(tuple_values)
                                    : new RankingsTuple(tuple_values));
                            if (DataLoader.debug)
                                System.out.println("STORING TUPLE: " + tuple + " (DATA " + data + " | VALUE "
                                        + value + ")");
                            writer.append(new Text(key), tuple);
                        } catch (Exception ex) {
                            ex.printStackTrace();
                            System.err.println("Error[" + output_file + "]");
                            System.err.println("## Line:    " + lines);
                            System.err.println("## Content: " + line);
                        }
                    } else {
                        writer.append(new Text(key), new Text(value));
                    }
                }
                lines++;
                if (DataLoader.limit != null && lines >= DataLoader.limit)
                    break;
                if (DataLoader.debug && lines % 1000000 == 0)
                    System.out.println(
                            "\tWrote " + lines + " '" + input_type + "' records to '" + output_file + "'");
            } catch (Exception ex) {
                System.err.println("Error[" + output_file + "]");
                System.err.println("## Line:    " + lines);
                System.err.println("## Content: " + line);
                ex.printStackTrace();
                System.exit(1);
            }
        } // WHILE
    } catch (Exception ex) {
        ex.printStackTrace();
        System.exit(1);
    } finally {
        try {
            if (in != null)
                in.close();
            if (out != null)
                out.close();
            if (writer != null)
                writer.close();
        } catch (Exception ex) {
            ex.printStackTrace();
            System.exit(1);
        }
    }
    System.out.println("Wrote " + lines + " '" + input_type + "' records to '" + output_file + "'");
}
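
When -compress is set, the overload used above, createWriter(fs, conf, path, keyClass, valClass, compressionType), leaves the choice of codec to Hadoop's defaults. As a sketch only, assuming DefaultCodec (imports of org.apache.hadoop.io.compress.DefaultCodec and org.apache.hadoop.util.ReflectionUtils are assumed), the seven-argument overload makes the codec explicit for the same key_class/value_class writer:

CompressionCodec codec = ReflectionUtils.newInstance(DefaultCodec.class, conf);
writer = SequenceFile.createWriter(fs, conf, new Path(output_file), key_class, value_class,
        SequenceFile.CompressionType.BLOCK, codec);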

From source file: edu.indiana.soic.ts.mapreduce.pwd.PairWiseDistance.java

License: Open Source License

private void distributeData(int blockSize, Configuration conf, FileSystem fs, Path hdInputDir,
        int noOfDivisions) throws IOException {
    // Writing block meta data to for each block in a separate file so that
    // Hadoop will create separate Map tasks for each block..
    // Key : block number
    // Value: row#column#isDiagonal#base_file_name
    // TODO : find a better way to do this.
    for (int row = 0; row < noOfDivisions; row++) {
        for (int column = 0; column < noOfDivisions; column++) {
            // using the load balancing algorithm to select the blocks
            // include the diagonal blocks as they are blocks, not
            // individual pairs
            if (((row >= column) & ((row + column) % 2 == 0)) | ((row <= column) & ((row + column) % 2 == 1))) {
                Path vFile = new Path(hdInputDir, "data_file_" + row + "_" + column);
                SequenceFile.Writer vWriter = SequenceFile.createWriter(fs, conf, vFile, LongWritable.class,
                        Text.class, CompressionType.NONE);

                boolean isDiagonal = false;
                if (row == column) {
                    isDiagonal = true;
                }
                String value = row + Constants.BREAK + column + Constants.BREAK + isDiagonal + Constants.BREAK
                        + Constants.HDFS_SEQ_FILENAME;
                vWriter.append(new LongWritable(row * blockSize + column), new Text(value));
                vWriter.close();
            }
        }
    }
}

From source file: edu.ucsb.cs.hybrid.io.Splitter.java

License: Apache License

/**
 * Checks input files and picks one with the requested S_size.
 * @param job : job configuration.
 * @param inputPath: path to contain the one map file.
 * @param othersPath: other path that contains the whole input.
 * @param S_size: s vectors put into one map file.
 */
public static void createOneMapFile(JobConf job, Path inputPath, Path othersPath, long S_size)
        throws IOException {
    FileStatus[] files = hdfs.listStatus(othersPath);
    for (int i = 0; i < files.length; i++) {
        if (Collector.countFileVectors(hdfs, files[i].getPath(), job) >= S_size) {
            SequenceFile.Reader reader = new SequenceFile.Reader(hdfs, files[i].getPath(), job);
            SequenceFile.Writer writer = SequenceFile.createWriter(hdfs, job,
                    new Path(inputPath.getName() + "/" + files[i].getPath().getName()), LongWritable.class,
                    FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);

            long vCount = -1;
            while (reader.next(key, value) && (++vCount) < S_size)
                writer.append(key, value);
            writer.close();
            return;
        }
    }
    throw new UnsupportedEncodingException("S_size requested is larger than each file !");
}

From source file: edu.ucsb.cs.hybrid.io.Splitter.java

License: Apache License

/**
 * splits the files in the input directory into at most s vectors
 * each. It does not combine the vectors from two different partitions.
 * @param job : configurations.
 * @param S_size : split files into at most this size of vectors.
 * @param inputPath : path of the directory of the input files.
 * @return path of the splitted files with each at most s vectors.
 */
public static Path splitAll(JobConf job, long S_size, Path inputPath) throws IOException {

    System.out.println(
            "Splitter.splitAll() from " + inputPath.getName() + " into partitions of size at most " + S_size);
    LongWritable key = new LongWritable();
    FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
    SequenceFile.Writer writer = null;

    String tmpDir = "splits-tmp";
    hdfs.delete(new Path(tmpDir), true);
    hdfs.mkdirs(new Path(tmpDir));

    FileStatus[] files = Partitioner.setFiles(hdfs, inputPath);
    for (int i = 0; i < files.length; i++) {
        if ((hdfs.isDirectory(files[i].getPath()) || files[i].getPath().getName().startsWith("_")))
            continue;
        SequenceFile.Reader reader = new SequenceFile.Reader(hdfs, files[i].getPath(), job);
        long subpartition = 0, vecCount = 0;

        while (reader.next(key, value)) {
            vecCount++;
            if (vecCount == 1) {
                if (writer != null)
                    writer.close();
                subpartition++;
                writer = SequenceFile.createWriter(hdfs, job,
                        new Path(tmpDir + "/" + files[i].getPath().getName() + "-" + subpartition),
                        LongWritable.class, FeatureWeightArrayWritable.class,
                        SequenceFile.CompressionType.NONE);

            }
            writer.append(key, value);
            if (vecCount == S_size)
                vecCount = 0;
        }
    }
    if (writer != null)
        writer.close();
    return new Path(tmpDir);
}

From source file: edu.ucsb.cs.lsh.minhash.MinHashLshDriver.java

License: Apache License

public static void writeLsh(JobConf job, FileSystem fs, LshTable lshTable) {
    try {
        Path lshfile = new Path("lshfile");
        NullWritable none = NullWritable.get();
        if (fs.exists(lshfile))
            fs.delete(lshfile);
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, lshfile, LshTable.class,
                NullWritable.class, SequenceFile.CompressionType.NONE);
        writer.append(lshTable, none);
        writer.close();
        DistributedCache.addCacheFile(new URI("lshfile"), job);
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file: edu.ucsb.cs.lsh.statistics.LshStat.java

License: Apache License

public static void convertInput(String[] args) throws IOException {

    if (args.length != 3)
        printUsage(2);

    String strLine, input = args[1], output_file = args[2];
    Path outPath = new Path(output_file);
    Configuration conf = new Configuration();
    FileSystem fs = outPath.getFileSystem(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outPath, DocDocWritable.class,
            FloatWritable.class, SequenceFile.CompressionType.NONE);

    if ((new File(input)).isDirectory()) {
        for (File inputFile : (new File(input)).listFiles()) {
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(new DataInputStream(new FileInputStream(inputFile))));

            while ((strLine = br.readLine()) != null) {
                writer.append(new DocDocWritable(0, 3), new FloatWritable(1));
            }
        }
    } else {
    }
    writer.close();
}

From source file: edu.ucsb.cs.lsh.statistics.LshStat.java

License: Apache License

public static void produceMaxBucket(String args[]) throws IOException {
    if (args.length == 3)
        maxBucketID = Integer.parseInt(args[2]);
    else if (args.length != 2)
        printUsage(4);

    Path inputPath = new Path(args[1]);
    Path outPath = new Path("maxBucket");
    Configuration conf = new Configuration();
    FileSystem fs = inputPath.getFileSystem(conf);
    if (fs.exists(outPath))
        fs.delete(outPath);
    FileStatus[] files = fs.listStatus(inputPath);
    SequenceFile.Writer writer = null;
    int bucketCount = 0;

    for (FileStatus file : files) {
        if ((fs.isDirectory(file.getPath())) || file.getPath().getName().startsWith("_"))
            continue;

        Reader reader = new SequenceFile.Reader(fs, file.getPath(), conf);
        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();

        while (reader.next(key, value))
            if (key.get() == 0) {
                bucketCount++;
                if (bucketCount == maxBucketID) {
                    writer = SequenceFile.createWriter(fs, conf, outPath, LongWritable.class,
                            FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);
                    while (reader.next(key, value) && (key.get() != 0))
                        writer.append(key, value);
                    writer.close();
                    return;
                }
            }
    }
}