Java tutorial: extracting Wikilinks mention contexts with Hadoop MapReduce

The class below is a Hadoop MapReduce job that scans a Wikilinks Thrift corpus, matches each mention's Wikipedia page name against a set of TREC KBA filter topics, and writes out the topic, the mention's left and right context, and its anchor text.
/*******************************************************************************
 * Copyright 2012 G.G Gebremeskel
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package nl.cwi.wikilink.apps;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import nl.cwi.json2013.topics.Filter_topics;
import nl.cwi.json2013.topics.Targets;
import nl.cwi.wikilink.thrift.Mention;
import nl.cwi.wikilink.thrift.WikiLinkItem;
import nl.cwi.wikilink.thrift.bin.ThriftFileInputFormat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

/**
 * Reads a Wikilinks Thrift corpus and, for every mention whose Wikipedia page
 * name matches one of the filter topics, emits the topic together with the
 * mention's left and right context and its anchor text.
 *
 * @author G.G Gebremeskel
 */
public class WikiLinkContextExtractor extends Configured implements Tool {

    public static final String RUNTAG = "wikilink";
    public static final String QUERYFILEPATH_HDFS = "kba.topicfilelocation";

    protected enum Counter {
        documents
    }

    private static final Logger LOG = Logger.getLogger(WikiLinkContextExtractor.class);

    public static class MyMapper extends Mapper<Text, WikiLinkItem, Text, Text> {

        private Map<String, Pattern> topicregexes = new LinkedHashMap<String, Pattern>();
        private Filter_topics ft = null;

        @Override
        public void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            // The topic file is available under this name via the distributed-cache symlink.
            loadTopics(QUERYFILEPATH_HDFS);
            LOG.info("Loaded " + topicregexes.size() + " topics:");
            for (String k : topicregexes.keySet()) {
                LOG.info(k);
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }

        @Override
        public void map(Text key, WikiLinkItem value, Context context)
                throws IOException, InterruptedException {
            context.getCounter(Counter.documents).increment(1);

            Iterator it = value.getMentionsIterator();
            // String url = value.getUrl();
            /*
            System.out.println(url + " mentions the following \n");
            for (String k : topicregexes.keySet()) {
                System.out.println(k);
            }
            */
            String filename = key.toString();

            // Calculate the score as the relative frequency of the entity
            // occurring in the document:
            // count = 1000 * (count * topic.length()) / body.length();
            // context.write(new Text("========================================================"), new Text(""));
            // context.write(new Text(url + ":"), new Text(""));
            // String collect = "";
            while (it.hasNext()) {
                Mention m = (Mention) it.next();
                for (String k : topicregexes.keySet()) {
                    // The page name is the last segment of the mention's Wikipedia URL.
                    String[] path_split = m.getWiki_url().toString().split("/");
                    String pageName = path_split[path_split.length - 1];
                    if (pageName.equalsIgnoreCase(k)) {
                        // context.write(new Text(k), new Text(m.toString()));
                        nl.cwi.wikilink.thrift.Context con = m.getContext();
                        if (con != null) {
                            context.write(
                                    new Text(k + "\t" + con.getLeft().toString() + "\t"
                                            + con.getRight().toString()),
                                    new Text(m.getAnchor_text().toString()));
                        }
                    }
                }
            }
        }
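        // Illustrative sketch only, not called anywhere in this job: the scoring
        // idea noted in the comment inside map() above, i.e. a relative frequency
        // scaled to the range 0..1000. The method and parameter names (count,
        // topic, body) are assumptions for illustration, not part of the original code.
        private static int relativeFrequencyScore(int count, String topic, String body) {
            if (body == null || body.isEmpty()) {
                return 0;
            }
            // 1000 * (count * topic.length()) / body.length(), computed in long
            // arithmetic to avoid intermediate overflow.
            return (int) (1000L * count * topic.length() / body.length());
        }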
        /**
         * Loads the JSON topic file.
         *
         * @param xcontext local path (the distributed-cache symlink) of the topic file
         */
        private void loadTopics(String xcontext) {
            DataInputStream in = null;
            try {
                in = new DataInputStream(new FileInputStream(xcontext));
                BufferedReader br = new BufferedReader(new InputStreamReader(in));
                ft = new Filter_topics.Factory().loadTopics(br);
                LOG.info(ft.getTopic_set_id());

                Targets[] t = ft.getTargets();
                for (Targets targ : t) {
                    // add the full name as a case-insensitive pattern
                    Pattern p = Pattern.compile(
                            ".*\\b+" + targ.target_id.replaceAll("_", " ") + "\\b+.*",
                            Pattern.CASE_INSENSITIVE);
                    // Only Wikipedia targets are kept; the page name is the fifth
                    // path segment of a http://en.wikipedia.org/wiki/Page target_id.
                    if (!targ.target_id.startsWith("https://twitter.com/")) {
                        topicregexes.put(targ.target_id.split("/")[4], p);
                        LOG.info(targ.target_id);
                    }
                    // add the individual terms
                    // HashSet<String> pset = new HashSet<String>();
                    // pset.addAll(new HashSet<String>(Arrays.asList(targ.target_id.split("_"))));
                    // pset.add(targ.target_id.replaceAll("_", " "));
                    // partialtopics.put(targ.target_id, pset);
                }
            } catch (IOException e) {
                e.printStackTrace();
                LOG.error("read from distributed cache: read instances");
            } catch (Exception e) {
                e.printStackTrace();
                LOG.error("read from distributed cache: " + e);
            } finally {
                if (in != null) {
                    try {
                        in.close();
                    } catch (IOException e1) {
                        e1.printStackTrace();
                    }
                }
            }
        }
    }
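    // Illustrative sketch only, not referenced by the code above: map() takes the
    // last path segment of the mention URL as the page name, while loadTopics()
    // takes segment index 4 of the target_id URL; for standard
    // http://en.wikipedia.org/wiki/Page URLs these are the same segment. A shared
    // helper along these lines could replace the duplicated split("/") logic.
    // The method name lastPathSegment is an assumption, not original code.
    private static String lastPathSegment(String url) {
        String[] parts = url.split("/");
        return parts[parts.length - 1];
    }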
    /**
     * Dispatches the tool through {@link ToolRunner}.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new WikiLinkContextExtractor(), args);
        System.exit(res);
    }

    static int printUsage() {
        System.out.println("Usage: " + WikiLinkContextExtractor.class.getName()
                + " -i input -o output -q queryfile \n\n"
                + "Example usage: hadoop jar TRECKBA_fat_new.jar "
                + WikiLinkContextExtractor.class.getName() + " "
                + "-i KBA/Data/wikilinks_trial/* "
                + "-o KBA/OutPut/wikilinks "
                + "-q KBA/Data/trec-kba-ccr-2012.filter-topics.json ");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    @Override
    public int run(String[] args) throws Exception {
        String in = null;
        String out = null;
        String queryFile = null;

        List<String> other_args = new ArrayList<String>();
        for (int i = 0; i < args.length; ++i) {
            try {
                if ("-i".equals(args[i])) {
                    in = args[++i];
                } else if ("-o".equals(args[i])) {
                    out = args[++i];
                } else if ("-q".equals(args[i])) {
                    queryFile = args[++i];
                } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                    return printUsage();
                } else {
                    other_args.add(args[i]);
                }
            } catch (ArrayIndexOutOfBoundsException except) {
                System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
                return printUsage();
            }
        }

        // The query file is dereferenced below, so it is required as well.
        if (other_args.size() > 0 || in == null || out == null || queryFile == null)
            return printUsage();

        LOG.info("Tool: " + this.getClass().getName());
        LOG.info(" - input path: " + in);
        LOG.info(" - output path: " + out);
        LOG.info(" - query file: " + queryFile);

        Configuration conf = getConf();
        conf.set(QUERYFILEPATH_HDFS, new Path(queryFile).toUri().toString());
        FileSystem fs = FileSystem.get(conf);

        // Lookup required data from the topic file
        // loadTopicData(queryfile, fr, fs, run_info);

        Job job = new Job(conf, "WikiLinks");
        job.setJarByClass(WikiLinkContextExtractor.class);

        // some weird issues with Thrift classes in the Hadoop distro.
        job.setUserClassesTakesPrecedence(true);

        // make the query file available to each mapper.
        DistributedCache.addCacheFile(
                new URI(new Path(queryFile) + "#" + QUERYFILEPATH_HDFS),
                job.getConfiguration());
        DistributedCache.createSymlink(job.getConfiguration());

        job.setInputFormatClass(ThriftFileInputFormat.class);
        job.setMapperClass(MyMapper.class);
        FileInputFormat.addInputPath(job, new Path(in));

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // job.setCombinerClass(MyReducer.class);
        // job.setReducerClass(MyReducer.class);
        job.setNumReduceTasks(1);

        FileSystem.get(conf).delete(new Path(out), true);
        TextOutputFormat.setOutputPath(job, new Path(out));
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Let's go
        int status = job.waitForCompletion(true) ? 0 : 1;

        // add some more statistics
        Counters c = job.getCounters();

        return status;
    }
}
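How the job is launched, as a usage sketch: the command below is taken from the printUsage() string above, so the jar name TRECKBA_fat_new.jar and the HDFS paths are examples from that string rather than requirements.

    hadoop jar TRECKBA_fat_new.jar nl.cwi.wikilink.apps.WikiLinkContextExtractor \
        -i 'KBA/Data/wikilinks_trial/*' \
        -o KBA/OutPut/wikilinks \
        -q KBA/Data/trec-kba-ccr-2012.filter-topics.json

The -q file is shipped to every task through the distributed cache with the fragment #kba.topicfilelocation, and createSymlink() makes it appear under that name in the task working directory, which is why loadTopics() can open it with a plain FileInputStream.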