Example usage for org.apache.hadoop.util GenericOptionsParser printGenericCommandUsage

List of usage examples for org.apache.hadoop.util GenericOptionsParser printGenericCommandUsage

Introduction

On this page you can find example usages of org.apache.hadoop.util.GenericOptionsParser.printGenericCommandUsage.

Prototype

public static void printGenericCommandUsage(PrintStream out) 

Document

Print the usage message for generic command-line options supported.
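
The typical pattern, repeated in the examples below, is to call printGenericCommandUsage from a Tool when the command line does not validate: print the tool's own usage line first, then let GenericOptionsParser list the generic Hadoop options (-conf, -D, -fs, -libjars, -files, -archives, and so on). A minimal, self-contained sketch of that pattern follows; the class name UsageDemo is hypothetical.

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class UsageDemo extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            // Tool-specific usage line first ...
            System.err.println("Usage: " + getClass().getName() + " [generic options] <input> <output>");
            System.err.println();
            // ... then the generic Hadoop options supported by GenericOptionsParser.
            GenericOptionsParser.printGenericCommandUsage(System.err);
            return 1;
        }
        // Real job logic would go here.
        return 0;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner applies GenericOptionsParser before run(...) sees the remaining arguments.
        System.exit(ToolRunner.run(new UsageDemo(), args));
    }
}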

Usage

From source file:$.WordCount.java

License:Open Source License

public int run(String[] args) throws Exception {

        if (args.length != 2) {
            System.err.println(
                    "Usage: hadoop jar ${artifactId}-${version}-job.jar" + " [generic options] input output");
            System.err.println();
            GenericOptionsParser.printGenericCommandUsage(System.err);
            return 1;
        }

        String inputPath = args[0];
        String outputPath = args[1];

        // Create an object to coordinate pipeline creation and execution.
        Pipeline pipeline = new MRPipeline(WordCount.class, getConf());

        // Reference a given text file as a collection of Strings.
        PCollection<String> lines = pipeline.readTextFile(inputPath);

        // Define a function that splits each line in a PCollection of Strings into
        // a PCollection made up of the individual words in the file.
        // The second argument sets the serialization format.
        PCollection<String> words = lines.parallelDo(new Tokenizer(), Writables.strings());

        // Take the collection of words and remove known stop words.
        PCollection<String> noStopWords = words.filter(new StopWordFilter());

        // The count method applies a series of Crunch primitives and returns
        // a map of the unique words in the input PCollection to their counts.
        PTable<String, Long> counts = noStopWords.count();

        // Instruct the pipeline to write the resulting counts to a text file.
        pipeline.writeTextFile(counts, outputPath);

        // Execute the pipeline as a MapReduce.
        PipelineResult result = pipeline.done();

        return result.succeeded() ? 0 : 1;
    }

From source file:cn.edu.bjtu.cit.recommender.Recommender.java

License:Apache License

@SuppressWarnings("unchecked")
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println();
        System.err.println("Usage: " + this.getClass().getName()
                + " [generic options] input output [profiling] [estimation] [clustersize]");
        System.err.println();
        printUsage();
        GenericOptionsParser.printGenericCommandUsage(System.err);

        return 1;
    }
    OptionParser parser = new OptionParser(args);

    Pipeline pipeline = new MRPipeline(Recommender.class, getConf());

    if (parser.hasOption(CLUSTER_SIZE)) {
        pipeline.getConfiguration().setInt(ClusterOracle.CLUSTER_SIZE,
                Integer.parseInt(parser.getOption(CLUSTER_SIZE).getValue()));
    }

    if (parser.hasOption(PROFILING)) {
        pipeline.getConfiguration().setBoolean(Profiler.IS_PROFILE, true);
        this.profileFilePath = parser.getOption(PROFILING).getValue();

    }

    if (parser.hasOption(ESTIMATION)) {
        estFile = parser.getOption(ESTIMATION).getValue();
        est = new Estimator(estFile, clusterSize);
    }

    if (parser.hasOption(OPT_REDUCE)) {
        pipeline.getConfiguration().setBoolean(OPT_REDUCE, true);
    }

    if (parser.hasOption(OPT_MSCR)) {
        pipeline.getConfiguration().setBoolean(OPT_MSCR, true);
    }

    if (parser.hasOption(ACTIVE_THRESHOLD)) {
        threshold = Integer.parseInt(parser.getOption("at").getValue());
    }

    if (parser.hasOption(TOP)) {
        top = Integer.parseInt(parser.getOption("top").getValue());
    }

    profiler = new Profiler(pipeline);
    /*
     * input node
     */
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    if (profiler.isProfiling() && lines.getSize() > 10 * 1024 * 1024) {
        lines = lines.sample(0.1);
    }

    /*
     * S0 + GBK
     */
    PGroupedTable<Long, Long> userWithPrefs = lines.parallelDo(new MapFn<String, Pair<Long, Long>>() {

        @Override
        public Pair<Long, Long> map(String input) {
            String[] split = input.split(Estimator.DELM);
            long userID = Long.parseLong(split[0]);
            long itemID = Long.parseLong(split[1]);
            return Pair.of(userID, itemID);
        }

        @Override
        public float scaleFactor() {
            return est.getScaleFactor("S0").sizeFactor;
        }

        @Override
        public float scaleFactorByRecord() {
            return est.getScaleFactor("S0").recsFactor;
        }
    }, Writables.tableOf(Writables.longs(), Writables.longs())).groupByKey(est.getClusterSize());

    /*
     * S1
     */
    PTable<Long, Vector> userVector = userWithPrefs
            .parallelDo(new MapFn<Pair<Long, Iterable<Long>>, Pair<Long, Vector>>() {
                @Override
                public Pair<Long, Vector> map(Pair<Long, Iterable<Long>> input) {
                    Vector userVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
                    for (long itemPref : input.second()) {
                        userVector.set((int) itemPref, 1.0f);
                    }
                    return Pair.of(input.first(), userVector);
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S1").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S1").recsFactor;
                }
            }, Writables.tableOf(Writables.longs(), Writables.vectors()));

    userVector = profiler.profile("S0-S1", pipeline, userVector, ProfileConverter.long_vector(),
            Writables.tableOf(Writables.longs(), Writables.vectors()));

    /*
     * S2
     */
    PTable<Long, Vector> filteredUserVector = userVector
            .parallelDo(new DoFn<Pair<Long, Vector>, Pair<Long, Vector>>() {

                @Override
                public void process(Pair<Long, Vector> input, Emitter<Pair<Long, Vector>> emitter) {
                    if (input.second().getNumNondefaultElements() > threshold) {
                        emitter.emit(input);
                    }
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S2").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S2").recsFactor;
                }

            }, Writables.tableOf(Writables.longs(), Writables.vectors()));

    filteredUserVector = profiler.profile("S2", pipeline, filteredUserVector, ProfileConverter.long_vector(),
            Writables.tableOf(Writables.longs(), Writables.vectors()));

    /*
     * S3 + GBK
     */
    PGroupedTable<Integer, Integer> coOccurencePairs = filteredUserVector
            .parallelDo(new DoFn<Pair<Long, Vector>, Pair<Integer, Integer>>() {
                @Override
                public void process(Pair<Long, Vector> input, Emitter<Pair<Integer, Integer>> emitter) {
                    Iterator<Vector.Element> it = input.second().iterateNonZero();
                    while (it.hasNext()) {
                        int index1 = it.next().index();
                        Iterator<Vector.Element> it2 = input.second().iterateNonZero();
                        while (it2.hasNext()) {
                            int index2 = it2.next().index();
                            emitter.emit(Pair.of(index1, index2));
                        }
                    }
                }

                @Override
                public float scaleFactor() {
                    float size = est.getScaleFactor("S3").sizeFactor;
                    return size;
                }

                @Override
                public float scaleFactorByRecord() {
                    float recs = est.getScaleFactor("S3").recsFactor;
                    return recs;
                }
            }, Writables.tableOf(Writables.ints(), Writables.ints())).groupByKey(est.getClusterSize());

    /*
     * S4
     */
    PTable<Integer, Vector> coOccurenceVector = coOccurencePairs
            .parallelDo(new MapFn<Pair<Integer, Iterable<Integer>>, Pair<Integer, Vector>>() {
                @Override
                public Pair<Integer, Vector> map(Pair<Integer, Iterable<Integer>> input) {
                    Vector cooccurrenceRow = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
                    for (int itemIndex2 : input.second()) {
                        cooccurrenceRow.set(itemIndex2, cooccurrenceRow.get(itemIndex2) + 1.0);
                    }
                    return Pair.of(input.first(), cooccurrenceRow);
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S4").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S4").recsFactor;
                }
            }, Writables.tableOf(Writables.ints(), Writables.vectors()));

    coOccurenceVector = profiler.profile("S3-S4", pipeline, coOccurenceVector, ProfileConverter.int_vector(),
            Writables.tableOf(Writables.ints(), Writables.vectors()));

    /*
     * S5 Wrapping co-occurrence columns
     */
    PTable<Integer, VectorOrPref> wrappedCooccurrence = coOccurenceVector
            .parallelDo(new MapFn<Pair<Integer, Vector>, Pair<Integer, VectorOrPref>>() {

                @Override
                public Pair<Integer, VectorOrPref> map(Pair<Integer, Vector> input) {
                    return Pair.of(input.first(), new VectorOrPref(input.second()));
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S5").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S5").recsFactor;
                }

            }, Writables.tableOf(Writables.ints(), VectorOrPref.vectorOrPrefs()));

    wrappedCooccurrence = profiler.profile("S5", pipeline, wrappedCooccurrence, ProfileConverter.int_vopv(),
            Writables.tableOf(Writables.ints(), VectorOrPref.vectorOrPrefs()));

    /*
     * S6 Splitting user vectors
     */
    PTable<Integer, VectorOrPref> userVectorSplit = filteredUserVector
            .parallelDo(new DoFn<Pair<Long, Vector>, Pair<Integer, VectorOrPref>>() {

                @Override
                public void process(Pair<Long, Vector> input, Emitter<Pair<Integer, VectorOrPref>> emitter) {
                    long userID = input.first();
                    Vector userVector = input.second();
                    Iterator<Vector.Element> it = userVector.iterateNonZero();
                    while (it.hasNext()) {
                        Vector.Element e = it.next();
                        int itemIndex = e.index();
                        float preferenceValue = (float) e.get();
                        emitter.emit(Pair.of(itemIndex, new VectorOrPref(userID, preferenceValue)));
                    }
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S6").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S6").recsFactor;
                }
            }, Writables.tableOf(Writables.ints(), VectorOrPref.vectorOrPrefs()));

    userVectorSplit = profiler.profile("S6", pipeline, userVectorSplit, ProfileConverter.int_vopp(),
            Writables.tableOf(Writables.ints(), VectorOrPref.vectorOrPrefs()));

    /*
     * S7 Combine VectorOrPrefs
     */
    PTable<Integer, VectorAndPrefs> combinedVectorOrPref = wrappedCooccurrence.union(userVectorSplit)
            .groupByKey(est.getClusterSize())
            .parallelDo(new DoFn<Pair<Integer, Iterable<VectorOrPref>>, Pair<Integer, VectorAndPrefs>>() {

                @Override
                public void process(Pair<Integer, Iterable<VectorOrPref>> input,
                        Emitter<Pair<Integer, VectorAndPrefs>> emitter) {
                    Vector vector = null;
                    List<Long> userIDs = Lists.newArrayList();
                    List<Float> values = Lists.newArrayList();
                    for (VectorOrPref vop : input.second()) {
                        if (vector == null) {
                            vector = vop.getVector();
                        }
                        long userID = vop.getUserID();
                        if (userID != Long.MIN_VALUE) {
                            userIDs.add(vop.getUserID());
                        }
                        float value = vop.getValue();
                        if (!Float.isNaN(value)) {
                            values.add(vop.getValue());
                        }
                    }
                    emitter.emit(Pair.of(input.first(), new VectorAndPrefs(vector, userIDs, values)));
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S7").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S7").recsFactor;
                }
            }, Writables.tableOf(Writables.ints(), VectorAndPrefs.vectorAndPrefs()));

    combinedVectorOrPref = profiler.profile("S5+S6-S7", pipeline, combinedVectorOrPref,
            ProfileConverter.int_vap(), Writables.tableOf(Writables.ints(), VectorAndPrefs.vectorAndPrefs()));
    /*
     * S8 Computing partial recommendation vectors
     */
    PTable<Long, Vector> partialMultiply = combinedVectorOrPref
            .parallelDo(new DoFn<Pair<Integer, VectorAndPrefs>, Pair<Long, Vector>>() {
                @Override
                public void process(Pair<Integer, VectorAndPrefs> input, Emitter<Pair<Long, Vector>> emitter) {
                    Vector cooccurrenceColumn = input.second().getVector();
                    List<Long> userIDs = input.second().getUserIDs();
                    List<Float> prefValues = input.second().getValues();
                    for (int i = 0; i < userIDs.size(); i++) {
                        long userID = userIDs.get(i);
                        if (userID != Long.MIN_VALUE) {
                            float prefValue = prefValues.get(i);
                            Vector partialProduct = cooccurrenceColumn.times(prefValue);
                            emitter.emit(Pair.of(userID, partialProduct));
                        }
                    }
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S8").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S8").recsFactor;
                }

            }, Writables.tableOf(Writables.longs(), Writables.vectors())).groupByKey(est.getClusterSize())
            .combineValues(new CombineFn<Long, Vector>() {

                @Override
                public void process(Pair<Long, Iterable<Vector>> input, Emitter<Pair<Long, Vector>> emitter) {
                    Vector partial = null;
                    for (Vector vector : input.second()) {
                        partial = partial == null ? vector : partial.plus(vector);
                    }
                    emitter.emit(Pair.of(input.first(), partial));
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("combine").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("combine").recsFactor;
                }
            });

    partialMultiply = profiler.profile("S8-combine", pipeline, partialMultiply, ProfileConverter.long_vector(),
            Writables.tableOf(Writables.longs(), Writables.vectors()));

    /*
     * S9 Producing recommendations from vectors
     */
    PTable<Long, RecommendedItems> recommendedItems = partialMultiply
            .parallelDo(new DoFn<Pair<Long, Vector>, Pair<Long, RecommendedItems>>() {

                @Override
                public void process(Pair<Long, Vector> input, Emitter<Pair<Long, RecommendedItems>> emitter) {
                    Queue<RecommendedItem> topItems = new PriorityQueue<RecommendedItem>(11,
                            Collections.reverseOrder(BY_PREFERENCE_VALUE));
                    Iterator<Vector.Element> recommendationVectorIterator = input.second().iterateNonZero();
                    while (recommendationVectorIterator.hasNext()) {
                        Vector.Element element = recommendationVectorIterator.next();
                        int index = element.index();
                        float value = (float) element.get();
                        if (topItems.size() < top) {
                            topItems.add(new GenericRecommendedItem(index, value));
                        } else if (value > topItems.peek().getValue()) {
                            topItems.add(new GenericRecommendedItem(index, value));
                            topItems.poll();
                        }
                    }
                    List<RecommendedItem> recommendations = new ArrayList<RecommendedItem>(topItems.size());
                    recommendations.addAll(topItems);
                    Collections.sort(recommendations, BY_PREFERENCE_VALUE);
                    emitter.emit(Pair.of(input.first(), new RecommendedItems(recommendations)));
                }

                @Override
                public float scaleFactor() {
                    return est.getScaleFactor("S9").sizeFactor;
                }

                @Override
                public float scaleFactorByRecord() {
                    return est.getScaleFactor("S9").recsFactor;
                }

            }, Writables.tableOf(Writables.longs(), RecommendedItems.recommendedItems()));

    recommendedItems = profiler.profile("S9", pipeline, recommendedItems, ProfileConverter.long_ri(),
            Writables.tableOf(Writables.longs(), RecommendedItems.recommendedItems()));

    /*
     * Profiling
     */
    if (profiler.isProfiling()) {
        profiler.writeResultToFile(profileFilePath);
        profiler.cleanup(pipeline.getConfiguration());
        return 0;
    }
    /*
     * asText
     */
    pipeline.writeTextFile(recommendedItems, args[1]);
    PipelineResult result = pipeline.done();
    return result.succeeded() ? 0 : 1;
}

From source file:com.cloudera.castagna.crunch.AverageBytesByIP.java

License:Apache License

public int run(String[] args) throws Exception {

    if (args.length != 2) {
        System.err.println();
        System.err.println("Two and only two arguments are accepted.");
        System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
        System.err.println();
        GenericOptionsParser.printGenericCommandUsage(System.err);
        return 1;
    }

    Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf());
    PCollection<String> lines = pipeline.readTextFile(args[0]);
    Aggregator<Pair<Long, Long>> agg = pairAggregator(SUM_LONGS(), SUM_LONGS());

    PTable<String, Pair<Long, Long>> remoteAddrResponseSize = lines
            .parallelDo(extractResponseSize,
                    Writables.tableOf(Writables.strings(),
                            Writables.pairs(Writables.longs(), Writables.longs())))
            .groupByKey().combineValues(agg);

    PTable<String, Double> avgs = remoteAddrResponseSize.parallelDo(calulateAverage,
            Writables.tableOf(Writables.strings(), Writables.doubles()));
    pipeline.writeTextFile(avgs.top(100), args[1]);
    PipelineResult result = pipeline.done();

    return result.succeeded() ? 0 : 1;
}

From source file:com.cloudera.castagna.crunch.PageViews.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println();
        System.err.println("Two and only two arguments are accepted.");
        System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
        System.err.println();
        GenericOptionsParser.printGenericCommandUsage(System.err);
        return 1;
    }

    Pipeline pipeline = new MRPipeline(PageViews.class, getConf());
    PCollection<String> lines = pipeline.readTextFile(args[0]);
    CombineFn<String, Long> longSumCombiner = CombineFn.SUM_LONGS();

    PTable<String, Long> pageViews = lines
            .parallelDo(extractIPResponseSize, Writables.tableOf(Writables.strings(), Writables.longs()))
            .groupByKey().combineValues(longSumCombiner).top(200);

    pipeline.writeTextFile(pageViews, args[1]);
    PipelineResult result = pipeline.done();

    return result.succeeded() ? 0 : 1;
}

From source file:com.cloudera.castagna.crunch.TotalBytesByIP.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println();
        System.err.println("Two and only two arguments are accepted.");
        System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
        System.err.println();
        GenericOptionsParser.printGenericCommandUsage(System.err);
        return 1;
    }

    Pipeline pipeline = new MRPipeline(TotalBytesByIP.class, getConf());
    PCollection<String> lines = pipeline.readTextFile(args[0]);
    CombineFn<String, Long> longSumCombiner = CombineFn.SUM_LONGS();

    PTable<String, Long> ipAddrResponseSize = lines
            .parallelDo(extractIPResponseSize, Writables.tableOf(Writables.strings(), Writables.longs()))
            .groupByKey().combineValues(longSumCombiner).top(10);

    pipeline.writeTextFile(ipAddrResponseSize, args[1]);
    PipelineResult result = pipeline.done();

    return result.succeeded() ? 0 : 1;
}

From source file:com.cloudera.crunch.examples.AverageBytesByIP.java

License:Open Source License

public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println();
        System.err.println("Two and only two arguments are accepted.");
        System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
        System.err.println();
        GenericOptionsParser.printGenericCommandUsage(System.err);
        return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Combiner used for summing up response size and count
    CombineFn<String, Pair<Long, Long>> stringPairOfLongsSumCombiner = CombineFn
            .pairAggregator(CombineFn.SUM_LONGS, CombineFn.SUM_LONGS);

    // Table of (ip, sum(response size), count)
    PTable<String, Pair<Long, Long>> remoteAddrResponseSize = lines
            .parallelDo(extractResponseSize,
                    Writables.tableOf(Writables.strings(),
                            Writables.pairs(Writables.longs(), Writables.longs())))
            .groupByKey().combineValues(stringPairOfLongsSumCombiner);

    // Calculate average response size by ip address
    PTable<String, Double> avgs = remoteAddrResponseSize.parallelDo(calulateAverage,
            Writables.tableOf(Writables.strings(), Writables.doubles()));

    // write the result to a text file
    pipeline.writeTextFile(avgs, args[1]);
    // Execute the pipeline as a MapReduce.
    pipeline.done();
    return 0;
}

From source file:com.cloudera.crunch.examples.TotalBytesByIP.java

License:Open Source License

public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println();
        System.err.println("Two and only two arguments are accepted.");
        System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
        System.err.println();
        GenericOptionsParser.printGenericCommandUsage(System.err);
        return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(TotalBytesByIP.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Combiner used for summing up response size
    CombineFn<String, Long> longSumCombiner = CombineFn.SUM_LONGS();

    // Table of (ip, sum(response size))
    PTable<String, Long> ipAddrResponseSize = lines
            .parallelDo(extractIPResponseSize, Writables.tableOf(Writables.strings(), Writables.longs()))
            .groupByKey().combineValues(longSumCombiner);

    pipeline.writeTextFile(ipAddrResponseSize, args[1]);
    // Execute the pipeline as a MapReduce.
    pipeline.done();
    return 0;
}

From source file:com.cloudera.fts.App.java

License:Open Source License

private void printUsage() {
    GenericOptionsParser.printGenericCommandUsage(System.err);
    System.err.println("Basic Usage: [avro,proto,text2pb,count] <inputdir> <outputdir>");
    System.exit(1);
}

From source file:com.cloudera.fts.App.java

License:Open Source License

private void printAvroUsage() {
    GenericOptionsParser.printGenericCommandUsage(System.err);
    System.err.println(
            "Avro requires one extra argument, the event file name: avro <inputdir> <events_file> <outputdir>");
    System.exit(1);
}

From source file:com.cloudera.hadoop.hdfs.nfs.nfs4.NFS4Server.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    int port;
    try {
        port = Integer.parseInt(args[0]);
    } catch (Exception e) {
        System.err.println(this.getClass().getName() + " port");
        GenericOptionsParser.printGenericCommandUsage(System.err);
        return 1;
    }
    start(null, port);
    while (mRPCServer.isAlive()) {
        Thread.sleep(10000L);
    }
    return 0;
}