Java tutorial
package wikiduper.application;

/*
 * Cloud9: A MapReduce Library for Hadoop
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

import java.io.EOFException;
import java.io.IOException;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.wikiclean.WikiClean;
import org.wikiclean.WikiClean.WikiLanguage;
import org.wikiclean.WikiCleanBuilder;

import wikiduper.utils.DocSentence;
import wikiduper.wikipedia.WikipediaPage;
import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.io.pair.PairOfLongString;
import edu.umd.cloud9.io.pair.PairOfLongs;
import edu.umd.cloud9.io.pair.PairOfStrings;

public class GetSentenceClusters extends Configured implements Tool {
  private static final Logger LOG = Logger.getLogger(GetSentenceClusters.class);

  /**
   * ClusterMapper
   *
   * Reads in a map from docid -> sentence number -> cluster number as side data.
   *
   * Maps over Wikipedia input looking for pages with a matching docid and pulls out the
   * corresponding sentences.
   *
   * @author weissman
   */
  private static class ClusterMapper extends MapReduceBase
      implements Mapper<IntWritable, WikipediaPage, LongWritable, PairOfStrings> {
      //Mapper<LongWritable, WikipediaPage, IntWritable, Text> {

    // Map from docid -> sentence number -> cluster number
    static final TreeMap<Long, TreeMap<Long, Long>> docmap = new TreeMap<Long, TreeMap<Long, Long>>();

    // The document-sentence identifier
    static final LongWritable CLUSTER = new LongWritable();
    static final PairOfStrings TITLESENTENCE = new PairOfStrings();

    // Adapted from http://stackoverflow.com/questions/5553410/regular-expression-match-a-sentence
    static final Pattern sentenceregex = Pattern.compile(
        "# Match a sentence ending in punctuation or EOS.\n"
        + "[\\s]* # Leading white space\n"
        + "([A-Z\"] # First char capital letter or quotation\n"
        + "[^.!?\\n]* # Greedily consume up to punctuation.\n"
        + "(?: # Group for unrolling the loop.\n"
        + " [.!?] # (special) inner punctuation ok if\n"
        + " (?!['\"]?\\s|$) # not followed by ws or EOS.\n"
        + " [^.!?]* # Greedily consume up to punctuation.\n"
        + ")* # Zero or more (special normal*)\n"
        + "[.!?]? # Optional ending punctuation.\n"
        + "['\"]?) # Optional closing quote.\n"
        + "(\\s|\\n)*$? # Trailing white space or new line\n",
        Pattern.MULTILINE | Pattern.COMMENTS);

    public static WikiClean cleaner;

    public void map(IntWritable key, WikipediaPage p,
        OutputCollector<LongWritable, PairOfStrings> output, Reporter reporter) throws IOException {
    //public void map(LongWritable key, WikipediaPage p, OutputCollector<IntWritable, Text> output,
    //    Reporter reporter) throws IOException {
      if (!p.isArticle() || p.isEmpty()) return;

      String raw = p.getRawXML();
      String content = cleaner.clean(raw);
      //cleaner.getTitle(content);
      //String content = p.getContent();
      if (content == null) return;

      String line = content
          //.replace("\n", " ")
          .replace(" ", " ").replace(",", "").replace("(b.", "(b").replace("(d.", "(d");
      Matcher m = sentenceregex.matcher(line);

      // Assume a whole Wikipedia article has been passed to the mapper; track sentence number by counting.
      long sentencect = 0;
      long id = Long.parseLong(p.getDocid());
      if (!docmap.containsKey(id)) return;
      //System.out.println("Doc map contains id " + id);
      TreeMap<Long, Long> sentMap = docmap.get(id);

      try {
        // For each sentence in the input text:
        while (m.find()) {
          String sentence = m.group(1);
          if (sentMap.containsKey(sentencect)) {
            long clust = sentMap.get(sentencect);
            TITLESENTENCE.set(p.getTitle(), sentence);
            CLUSTER.set(clust);
            output.collect(CLUSTER, TITLESENTENCE);
          }
          sentencect++;
        }
      } catch (Throwable e) {
        System.err.println("WARNING: Possible stack overflow from regex at docid "
            + p.getDocid() + " and sentence # " + sentencect);
      }
      //System.out.println("Max sentence ct " + sentencect);
    }

    // Load the cluster map side data (cluster id -> list of doc/sentence ids) before the first map() call.
    public void configure(JobConf job) {
      String docMapFile = job.get("docmapfile");
      String language = job.get("wiki.language", "en");
      WikiLanguage wikilang = WikiLanguage.valueOf(language.toUpperCase());
      cleaner = new WikiCleanBuilder().withLanguage(wikilang).withTitle(true).withFooter(false).build();

      try {
        FileSystem fs = FileSystem.get(job);
        FSDataInputStream in = fs.open(new Path(docMapFile));
        SequenceFile.Reader reader;
        reader = new SequenceFile.Reader(job, SequenceFile.Reader.stream(in));
        IntWritable cluster = new IntWritable();
        ArrayListWritable<DocSentence> sentlist = new ArrayListWritable<DocSentence>();
        while (reader.next(cluster, sentlist)) {
          for (DocSentence ds : sentlist) {
            if (!docmap.containsKey(ds.getId())) {
              docmap.put(ds.getId(), new TreeMap<Long, Long>());
            }
            if (docmap.get(ds.getId()).containsKey(ds.getSentence())) {
              System.out.println("Sentence in more than one cluster: " + ds);
            }
            docmap.get(ds.getId()).put((long) ds.getSentence(), (long) cluster.get());
          }
        }
        reader.close();
      } catch (EOFException e) {
        // For some reason it doesn't know when the input stream is done??
      } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
    }
  }

  /*
  private static class ClusterReducer extends MapReduceBase
      implements Reducer<IntWritable, PairOfStringInt, IntWritable, Text> {
    static final Text articleSentence = new Text();
    WikipediaForwardIndex INDEX;

    @Override
    public void reduce(IntWritable key, Iterator<PairOfStringInt> values,
        OutputCollector<IntWritable, Text> output, Reporter reporter) throws IOException {
      while (values.hasNext()) {
        PairOfStringInt val = values.next();
        int docid = val.getRightElement();
        String sentence = val.getLeftElement();
        WikipediaPage page = INDEX.getDocument(docid);
        //System.out.println(page.getContent());
        articleSentence.set(page.getTitle() + "\t" + sentence);
        output.collect(key, articleSentence);
      }
    }

    @Override
    public void configure(JobConf conf) {
      INDEX = new WikipediaForwardIndex(conf);
      String indexFile = conf.get("indexfile");
      String mapFile = conf.get("mapfile");
      try {
        INDEX.loadIndex(new Path(indexFile), new Path(mapFile), FileSystem.get(conf));
      } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
    }
  }
  */

  //private static final String PAIRFILE = "pairfile";
  private static final String CLUSTERMAP = "clustermap";
  //private static final String INDEXFILE = "indexfile";
  //private static final String MAPFILE = "mapfile";
  private static final String INPUT = "input";
  private static final String OUTPUT = "output";
  private static final String NUM_REDUCERS = "numReducers";
  private static final String LANGUAGE_OPTION = "wiki_language";

  @SuppressWarnings("static-access")
  @Override
  public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("bz2 input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg()
        .withDescription("two-letter language code").create(LANGUAGE_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
        .withDescription("number of reducers").create(NUM_REDUCERS));
    //options.addOption(OptionBuilder.withArgName("path")
    //    .hasArg().withDescription("pair file").create(PAIRFILE));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("cluster map file").create(CLUSTERMAP));
    //options.addOption(OptionBuilder.withArgName("path")
    //    .hasArg().withDescription("index file").create(INDEXFILE));
    //options.addOption(OptionBuilder.withArgName("path")
    //    .hasArg().withDescription("map file").create(MAPFILE));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(CLUSTERMAP)) {
        //|| !cmdline.hasOption(INDEXFILE) || !cmdline.hasOption(MAPFILE)) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.setWidth(120);
      formatter.printHelp(this.getClass().getName(), options);
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
    }

    String language = "en";
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
      language = cmdline.getOptionValue(LANGUAGE_OPTION);
      if (language.length() != 2) {
        System.err.println("Error: \"" + language + "\" unknown language!");
        return -1;
      }
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    //String pairPath = cmdline.getOptionValue(PAIRFILE);
    String clusterPath = cmdline.getOptionValue(CLUSTERMAP);
    //String indexPath = cmdline.getOptionValue(INDEXFILE);
    //String mapPath = cmdline.getOptionValue(MAPFILE);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS)
        ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - bz2 file: " + inputPath);
    LOG.info(" - output file: " + outputPath);
    LOG.info(" - language: " + language);

    JobConf conf = new JobConf(getConf(), GetSentenceClusters.class);
    //conf.set("indexfile", indexPath);
    //conf.set("mapfile", mapPath);

    /* Get clusters from MinhashWikipediaPages pair output */
    //String docmapFile = "docmap.out";
    //String remoteDocmapFile = "docmap2.out";
    //getClusters(pairPath, conf, docmapFile);
    //System.exit(-1);
    //FileSystem fs = FileSystem.get(conf);
    //fs.copyFromLocalFile(new Path(docmapFile), new Path(remoteDocmapFile));
    conf.set("docmapfile", clusterPath);

    conf.setJobName(String.format("GetSentenceClusters[%s: %s, %s: %s, %s: %s]",
        INPUT, inputPath, OUTPUT, outputPath, LANGUAGE_OPTION, language));

    conf.setNumMapTasks(4);
    conf.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    if (language != null) {
      conf.set("wiki.language", language);
    }

    conf.setMapperClass(ClusterMapper.class);
    //conf.setReducerClass(ClusterReducer.class);

    //conf.setInputFormat(WikipediaPageInputFormat.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    // Set heap space - using old API
    conf.set("mapred.job.map.memory.mb", "2048");
    conf.set("mapred.map.child.java.opts", "-Xmx2048m");
    conf.set("mapred.job.reduce.memory.mb", "4096");
    conf.set("mapred.reduce.child.java.opts", "-Xmx4096m");

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(PairOfStrings.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(conf).delete(outputDir, true);

    JobClient.runJob(conf);

    return 0;
  }

  public GetSentenceClusters() {
  }

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new GetSentenceClusters(), args);
  }
}