Example usage for org.apache.mahout.text.wikipedia XmlInputFormat START_TAG_KEY

List of usage examples for org.apache.mahout.text.wikipedia XmlInputFormat START_TAG_KEY

Introduction

On this page you can find example usage of org.apache.mahout.text.wikipedia XmlInputFormat START_TAG_KEY.

Prototype

String START_TAG_KEY

To view the source code for org.apache.mahout.text.wikipedia XmlInputFormat START_TAG_KEY, click the Source Link.

Usage

From source file: se.lth.cs.koshik.util.Import.java

License:Open Source License

/**
 * Parses the command line, configures a Hadoop job that imports documents in the
 * requested input format, and runs that job to completion.
 *
 * <p>Required options are the input path(s), the input format (one of {@code text},
 * {@code conll2006}, {@code conll2009}, {@code wikipedia}), the input language and the
 * output path. The input charset is optional and defaults to {@code UTF-8}.
 *
 * <p>The input format and language values are lower-cased with {@link java.util.Locale#ROOT}
 * so that matching does not depend on the JVM's default locale.
 *
 * @param args command-line arguments remaining after generic option handling
 * @return {@code -1} if a required option is missing or the input format or charset is
 *         not supported; otherwise {@code 0} if the job completed successfully and
 *         {@code 1} if it did not
 * @throws Exception if the command line cannot be parsed or the job cannot be set up or run
 */
@SuppressWarnings("static-access") // presumably needed for the chained OptionBuilder calls below
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path,...").hasArg().withDescription("input path[s]")
            .create(OPTION_INPUTPATHS));
    options.addOption(OptionBuilder.withArgName("text|conll2006|conll2009|wikipedia").hasArg()
            .withDescription("input format").create(OPTION_INPUTFORMAT));
    options.addOption(OptionBuilder.withArgName("ISO 639-3").hasArg().withDescription("input language")
            .create(OPTION_INPUTLANGUAGE));
    options.addOption(OptionBuilder.withArgName("charset").hasArg().withDescription("input charset")
            .create(OPTION_INPUTCHARSET));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path")
            .create(OPTION_OUTPUTPATH));

    CommandLineParser commandLineParser = new GnuParser();
    CommandLine commandLine = commandLineParser.parse(options, args);

    // Everything except the charset is mandatory; print usage and bail out otherwise.
    if (!commandLine.hasOption(OPTION_INPUTPATHS) || !commandLine.hasOption(OPTION_INPUTFORMAT)
            || !commandLine.hasOption(OPTION_INPUTLANGUAGE) || !commandLine.hasOption(OPTION_OUTPUTPATH)) {
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.printHelp(getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPaths = commandLine.getOptionValue(OPTION_INPUTPATHS);
    // Locale.ROOT: the default-locale overload would turn "WIKIPEDIA" into "wıkıpedıa"
    // under a Turkish/Azeri locale and make the format comparison below fail.
    String inputFormat = commandLine.getOptionValue(OPTION_INPUTFORMAT).toLowerCase(java.util.Locale.ROOT);
    String language = commandLine.getOptionValue(OPTION_INPUTLANGUAGE).toLowerCase(java.util.Locale.ROOT);
    String charsetName = commandLine.hasOption(OPTION_INPUTCHARSET)
            ? commandLine.getOptionValue(OPTION_INPUTCHARSET)
            : "UTF-8";
    Path outputPath = new Path(commandLine.getOptionValue(OPTION_OUTPUTPATH));

    // Charset.forName throws IllegalCharsetNameException / UnsupportedCharsetException
    // (both IllegalArgumentException subclasses) for bad user input; report it like the
    // other usage errors instead of letting the exception escape.
    Charset charset;
    try {
        charset = Charset.forName(charsetName);
    } catch (IllegalArgumentException e) {
        System.err.println("Error: " + OPTION_INPUTCHARSET + ": " + charsetName + " is not supported!");
        return -1;
    }

    if (!(inputFormat.equals("text") || inputFormat.equals("conll2006") || inputFormat.equals("conll2009")
            || inputFormat.equals("wikipedia"))) {
        System.err.println(
                "Error: " + OPTION_INPUTFORMAT + " must be one of: text, conll2006, conll2009, wikipedia");
        return -1;
    }

    LOGGER.info("Utility name: " + this.getClass().getName());
    LOGGER.info(" - input path: " + inputPaths);
    LOGGER.info(" - input format: " + inputFormat);
    LOGGER.info(" - input charset: " + charset.displayName());
    LOGGER.info(" - input language: " + language);
    LOGGER.info(" - output path: " + outputPath);

    Job job = new Job(getConf(), getClass().getName());
    job.setJarByClass(getClass());

    // Stored in the job configuration under the option names, presumably so tasks can
    // read them back. name() is the canonical charset name.
    job.getConfiguration().set(OPTION_INPUTLANGUAGE, language);
    job.getConfiguration().set(OPTION_INPUTCHARSET, charset.name());

    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, outputPath);

    // Select the input format and mapper matching the requested format.
    if (inputFormat.equals("text")) {
        job.setInputFormatClass(WholeTextFileInputFormat.class);
        WholeTextFileInputFormat.setCharset(charset);

        job.setMapperClass(TextFileImportMapper.class);
    } else if (inputFormat.equals("conll2006")) {
        job.setInputFormatClass(WholeTextFileInputFormat.class);
        WholeTextFileInputFormat.setCharset(charset);

        job.setMapperClass(CoNLL2006FileImportMapper.class);
    } else if (inputFormat.equals("conll2009")) {
        job.setInputFormatClass(WholeTextFileInputFormat.class);
        WholeTextFileInputFormat.setCharset(charset);

        job.setMapperClass(CoNLL2009FileImportMapper.class);
    } else if (inputFormat.equals("wikipedia")) {
        // XmlInputFormat is configured with the start/end tags that delimit one record.
        job.getConfiguration().set(XmlInputFormat.START_TAG_KEY, "<page>");
        job.getConfiguration().set(XmlInputFormat.END_TAG_KEY, "</page>");
        job.setInputFormatClass(XmlInputFormat.class);

        job.setMapperClass(WikipediaImportMapper.class);
    } else {
        // Defensive: the format was validated above, so this is only reachable if the
        // validation list and this dispatch chain drift apart.
        System.err.println("Error: " + OPTION_INPUTFORMAT + ": " + inputFormat + " is not supported!");
        return -1;
    }

    // Map and final output both use AvroDocument keys with NullWritable values.
    AvroJob.setMapOutputKeySchema(job, AvroDocument.SCHEMA$);
    job.setMapOutputValueClass(NullWritable.class);

    job.setReducerClass(Reducer.class);
    AvroJob.setOutputKeySchema(job, AvroDocument.SCHEMA$);
    job.setOutputValueClass(NullWritable.class);
    job.setOutputFormatClass(AvroKeyOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
}