List of usage examples for org.apache.mahout.text.wikipedia XmlInputFormat START_TAG_KEY
String START_TAG_KEY
To view the source code for org.apache.mahout.text.wikipedia.XmlInputFormat.START_TAG_KEY, click the Source Link.
From source file: se.lth.cs.koshik.util.Import.java
License: Open Source License
@SuppressWarnings("static-access") @Override// w ww . j av a 2 s. co m public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path,...").hasArg().withDescription("input path[s]") .create(OPTION_INPUTPATHS)); options.addOption(OptionBuilder.withArgName("text|conll2006|conll2009|wikipedia").hasArg() .withDescription("input format").create(OPTION_INPUTFORMAT)); options.addOption(OptionBuilder.withArgName("ISO 639-3").hasArg().withDescription("input language") .create(OPTION_INPUTLANGUAGE)); options.addOption(OptionBuilder.withArgName("charset").hasArg().withDescription("input charset") .create(OPTION_INPUTCHARSET)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path") .create(OPTION_OUTPUTPATH)); CommandLine commandLine; CommandLineParser commandLineParser = new GnuParser(); commandLine = commandLineParser.parse(options, args); if (!commandLine.hasOption(OPTION_INPUTPATHS) || !commandLine.hasOption(OPTION_INPUTFORMAT) || !commandLine.hasOption(OPTION_INPUTLANGUAGE) || !commandLine.hasOption(OPTION_OUTPUTPATH)) { HelpFormatter helpFormatter = new HelpFormatter(); helpFormatter.printHelp(getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPaths = commandLine.getOptionValue(OPTION_INPUTPATHS); String inputFormat = commandLine.getOptionValue(OPTION_INPUTFORMAT).toLowerCase(); String language = commandLine.getOptionValue(OPTION_INPUTLANGUAGE).toLowerCase(); Charset charset = Charset.forName( commandLine.hasOption(OPTION_INPUTCHARSET) ? 
commandLine.getOptionValue(OPTION_INPUTCHARSET) : "UTF-8"); Path outputPath = new Path(commandLine.getOptionValue(OPTION_OUTPUTPATH)); if (!(inputFormat.equals("text") || inputFormat.equals("conll2006") || inputFormat.equals("conll2009") || inputFormat.equals("wikipedia"))) { System.err.println( "Error: " + OPTION_INPUTFORMAT + " must be one of: text, conll2006, conll2009, wikipedia"); return -1; } LOGGER.info("Utility name: " + this.getClass().getName()); LOGGER.info(" - input path: " + inputPaths); LOGGER.info(" - input format: " + inputFormat); LOGGER.info(" - input charset: " + charset.displayName()); LOGGER.info(" - input language: " + language); LOGGER.info(" - output path: " + outputPath); Job job = new Job(getConf(), getClass().getName()); job.setJarByClass(getClass()); job.getConfiguration().set(OPTION_INPUTLANGUAGE, language); job.getConfiguration().set(OPTION_INPUTCHARSET, charset.toString()); FileInputFormat.setInputPaths(job, inputPaths); FileOutputFormat.setOutputPath(job, outputPath); if (inputFormat.equals("text")) { job.setInputFormatClass(WholeTextFileInputFormat.class); WholeTextFileInputFormat.setCharset(charset); job.setMapperClass(TextFileImportMapper.class); } else if (inputFormat.equals("conll2006")) { job.setInputFormatClass(WholeTextFileInputFormat.class); WholeTextFileInputFormat.setCharset(charset); job.setMapperClass(CoNLL2006FileImportMapper.class); } else if (inputFormat.equals("conll2009")) { job.setInputFormatClass(WholeTextFileInputFormat.class); WholeTextFileInputFormat.setCharset(charset); job.setMapperClass(CoNLL2009FileImportMapper.class); } else if (inputFormat.equals("wikipedia")) { job.getConfiguration().set(XmlInputFormat.START_TAG_KEY, "<page>"); job.getConfiguration().set(XmlInputFormat.END_TAG_KEY, "</page>"); job.setInputFormatClass(XmlInputFormat.class); job.setMapperClass(WikipediaImportMapper.class); } else { System.err.println("Error: " + OPTION_INPUTFORMAT + ": " + inputFormat + " is not supported!"); return -1; } 
AvroJob.setMapOutputKeySchema(job, AvroDocument.SCHEMA$); job.setMapOutputValueClass(NullWritable.class); job.setReducerClass(Reducer.class); AvroJob.setOutputKeySchema(job, AvroDocument.SCHEMA$); job.setOutputValueClass(NullWritable.class); job.setOutputFormatClass(AvroKeyOutputFormat.class); return job.waitForCompletion(true) ? 0 : 1; }