Java tutorial: GoogleSyntacticsJob5, a Hadoop job built on DKPro BigData that runs the JoBimText holing pipeline over its input and writes the extracted JoBim pairs.
/*
 * Copyright 2012
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.lt.n2n.hadoop.pipetests;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;

import jobimtext.holing.extractor.JobimAnnotationExtractor;
import jobimtext.holing.extractor.JobimExtractorConfiguration;
import jobimtext.holing.type.JoBim;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.util.ToolRunner;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.lt.n2n.annotators.JoBimPrinter;
import de.tudarmstadt.lt.n2n.pipelines.JoBimRelationPipeline;
import de.tudarmstadt.lt.n2n.utilities.SHARED_CONSTANTS;
import de.tudarmstadt.lt.utilities.types.RepeatedSentence;
import de.tudarmstadt.ukp.dkpro.bigdata.hadoop.DkproHadoopDriver;
import de.tudarmstadt.ukp.dkpro.bigdata.hadoop.DkproMapper;
import de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.CASWritable;
import de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.Text2CASInputFormat;
import de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.Text2CASInputFormat.DocumentTextExtractor;

/**
 * @author Steffen Remus
 */
public class GoogleSyntacticsJob5 extends DkproHadoopDriver {

    static void print_usage(String message) {
        if (message != null && !"".equals(message))
            System.out.println(message);
        // Usage: HadoopPipe [hadoop-params] input output [num-mappers]
        System.out.format("Usage: ... %s -D%s=<extractor-configuration-file1>,<extractor-configuration-file2>,... input output [num-mappers] %n",
                GoogleSyntacticsJob5.class.getName(), SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
    }

    public static void main(String[] args) {
        try {
            ToolRunner.run(new Configuration(), new GoogleSyntacticsJob5(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    public AnalysisEngineDescription buildMapperEngine(Configuration conf) throws ResourceInitializationException {
        AggregateBuilder builder = new AggregateBuilder();
        // builder.add(AnalysisEngineFactory.createEngineDescription(MetaDataAnnotator.class));
        builder.add(AnalysisEngineFactory.createEngineDescription(
                JoBimRelationPipeline.createGoogleSyntacticsRelationEngine(
                        true /* create_tokens */,
                        true /* create_sentences */,
                        true /* create_dependencies */,
                        true /* create_new_relations */,
                        true /* create_dependency_path */,
                        true /* ignore_nn_relations */,
                        5 /* dependency_path_maxlength */,
                        false /* create_detailed_output */,
                        null /* extractor_configuration */,
                        null /* output_destination */)));
        return builder.createAggregateDescription();
    }

    @Override
    public AnalysisEngineDescription buildReducerEngine(Configuration job) throws ResourceInitializationException {
        return null;
    }

    @Override
    public Class<?> getInputFormatClass() {
        return Text2CASInputFormat.class;
    }

    @Override
    public void configure(JobConf job) {
        String extractorConfigurationFiles = job.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
        if (extractorConfigurationFiles == null) {
            extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ',');
            System.out.format("Extractor configuration file parameter not set. Assuming -D%s=%s %n",
                    SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
            job.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
        }

        try {
            // ship the extractor configuration files to the cluster nodes via the distributed cache
            String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
            for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
                DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), job);
        } catch (IOException e) {
            e.printStackTrace();
        }

        Text2CASInputFormat.setDocumentTextExtractorClass(job, KeyPlusValueAsDocumentExtractor.class);

        job.setMapperClass(JoBimMapper.class);
        job.setReducerClass(JoBimReducer.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setMemoryForMapTask(4096);
        job.setMemoryForReduceTask(4096);
        job.set("mapred.child.java.opts", "-Xmx4096m");
        job.setNumReduceTasks(1); // reset to default
    }

    /** Concatenates key and value (tab-separated) to form the CAS document text. */
    public static class KeyPlusValueAsDocumentExtractor implements DocumentTextExtractor {

        private static Text _text = new Text();

        @Override
        public Text extractDocumentText(Text key, Text value) {
            _text.set(key.toString() + "\t" + value.toString());
            return _text;
        }
    }

    /** Keys each CAS by the hash code of its document text, so identical documents are grouped in the reducer. */
    public static class JoBimMapper extends DkproMapper {

        @Override
        protected Text getOutputKey(Text key, CAS aCAS) {
            return new Text(String.valueOf(aCAS.getDocumentText().hashCode()));
        }
    }

    /** Extracts JoBim pairs from each CAS and writes them in the concise old output format. */
    public static class JoBimReducer extends MapReduceBase implements Reducer<Writable, CASWritable, Text, NullWritable> {

        Log logger = LogFactory.getLog(JoBimReducer.class);
        JobimAnnotationExtractor[] _extractors;
        Text _line = new Text();

        @Override
        public void configure(JobConf job) {
            try {
                String extractorConfigurationFiles = job.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
                String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
                // resolve configuration file names against their local distributed-cache copies
                for (int i = 0; i < extractorConfigurationFilesArr.length; i++) {
                    String extractorConfigurationFileName = new File(extractorConfigurationFilesArr[i]).getName();
                    for (Path p : DistributedCache.getLocalCacheFiles(job))
                        if (p.getName().contains(extractorConfigurationFileName))
                            extractorConfigurationFilesArr[i] = p.toString();
                }
                try {
                    _extractors = new JobimAnnotationExtractor[extractorConfigurationFilesArr.length];
                    for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
                        _extractors[i] = JobimExtractorConfiguration
                                .getExtractorFromXmlFile(new File(extractorConfigurationFilesArr[i]).getName());
                } catch (Exception e) {
                    throw new ResourceInitializationException(e);
                }
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        @Override
        public void reduce(Writable key, Iterator<CASWritable> values, OutputCollector<Text, NullWritable> output, Reporter reporter) throws IOException {
            while (values.hasNext()) {
                try {
                    JCas aJCas = values.next().getCAS().getJCas();
                    Collection<RepeatedSentence> covering_annotations = JCasUtil.select(aJCas, RepeatedSentence.class);
                    for (RepeatedSentence covering_annotation : covering_annotations) {
                        int repetitions = covering_annotation.getRepetitionCount();
                        for (JoBim jb : JoBimPrinter.getJoBims(covering_annotation, false)) {
                            for (JobimAnnotationExtractor extractor : _extractors) {
                                _line.set(JoBimPrinter.get_concise_string_old_format(jb, covering_annotation, extractor, repetitions));
                                output.collect(_line, NullWritable.get());
                            }
                        }
                    }
                } catch (CASException e) {
                    // log and skip CASes that cannot be converted to a JCas
                    e.printStackTrace();
                }
            }
        }
    }
}
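Because GoogleSyntacticsJob5 is driven through Hadoop's ToolRunner (see main above), it can also be launched programmatically with an explicit argument list. The following is a minimal sketch of such a launcher; the configuration file name and the input/output paths are placeholders rather than values from the project, and the -D option is handled by Hadoop's generic options parsing as in any ToolRunner-based job.

package de.tudarmstadt.lt.n2n.hadoop.pipetests;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

import de.tudarmstadt.lt.n2n.utilities.SHARED_CONSTANTS;

/* Hypothetical launcher sketch for GoogleSyntacticsJob5; all file names and paths are placeholders. */
public class GoogleSyntacticsJob5Launcher {

    public static void main(String[] args) throws Exception {
        String[] jobArgs = {
                // generic Hadoop -D option; the property name comes from SHARED_CONSTANTS,
                // the extractor configuration file name is a placeholder
                "-D" + SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS + "=extractor-config.xml",
                "input",  // placeholder input path
                "output"  // placeholder output path
        };
        int exitCode = ToolRunner.run(new Configuration(), new GoogleSyntacticsJob5(), jobArgs);
        System.exit(exitCode);
    }
}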