Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.ctakes.temporal.duration; import java.io.File; import java.io.FilenameFilter; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.time.temporal.TemporalField; import java.time.temporal.TemporalUnit; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import org.apache.ctakes.core.cr.XMIReader; import org.apache.ctakes.core.resource.FileLocator; import org.apache.ctakes.temporal.ae.feature.duration.DurationEventTimeFeatureExtractor; import org.apache.ctakes.typesystem.type.syntax.BaseToken; import org.apache.ctakes.typesystem.type.textsem.EventMention; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CASException; import org.apache.uima.collection.CollectionReader; import org.apache.uima.fit.factory.CollectionReaderFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.cleartk.ml.Feature; import com.google.common.base.Charsets; import com.google.common.base.Joiner; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Multiset; import com.google.common.io.Files; import com.google.common.io.LineProcessor; import com.googlecode.clearnlp.engine.EngineGetter; import com.googlecode.clearnlp.morphology.AbstractMPAnalyzer; import com.googlecode.clearnlp.reader.AbstractReader; import info.bethard.timenorm.Period; import info.bethard.timenorm.PeriodSet; import info.bethard.timenorm.Temporal; import info.bethard.timenorm.TemporalExpressionParser; import info.bethard.timenorm.TimeSpan; import info.bethard.timenorm.TimeSpanSet; import scala.collection.immutable.Set; import scala.util.Try; import info.bethard.timenorm.DefaultTokenizer$; /** * Various useful classes and methods for evaluating event duration data. */ public class Utils { // events and their duration distributions public static final String durationDistributionPath = "/Users/dima/Boston/Thyme/Duration/Data/Combined/Distribution/all.txt"; // eight bins over which we define a duration distribution public static final String[] bins = { "second", "minute", "hour", "day", "week", "month", "year", "decade" }; /** * Extract time unit(s) from a temporal expression * and put in one of the eight bins above. * Return empty set if time units could not be extracted. * E.g. July 5, 1984 -> day */ public static HashSet<String> getTimeUnits(String timex) { HashSet<String> timeUnits = new HashSet<>(); Set<TemporalUnit> units = runTimexParser(timex.toLowerCase()); if (units == null) { return timeUnits; } scala.collection.Iterator<TemporalUnit> iterator = units.iterator(); while (iterator.hasNext()) { TemporalUnit unit = iterator.next(); String bin = putInBin(unit.toString()); if (bin != null) { timeUnits.add(bin); } } return timeUnits; } /** * Use Bethard normalizer to map a temporal expression to a time unit. */ public static Set<TemporalUnit> runTimexParser(String timex) { URL grammarURL = DurationEventTimeFeatureExtractor.class.getResource("/info/bethard/timenorm/en.grammar"); TemporalExpressionParser parser = new TemporalExpressionParser(grammarURL, DefaultTokenizer$.MODULE$); TimeSpan anchor = TimeSpan.of(2013, 12, 16); Try<Temporal> result = parser.parse(timex, anchor); Set<TemporalUnit> units = null; if (result.isSuccess()) { Temporal temporal = result.get(); if (temporal instanceof Period) { units = ((Period) temporal).unitAmounts().keySet(); } else if (temporal instanceof PeriodSet) { units = ((PeriodSet) temporal).period().unitAmounts().keySet(); } else if (temporal instanceof TimeSpan) { units = ((TimeSpan) temporal).period().unitAmounts().keySet(); } else if (temporal instanceof TimeSpanSet) { Set<TemporalField> fields = ((TimeSpanSet) temporal).fields().keySet(); units = null; // fill units by calling .getBaseUnit() on each field } } return units; } /** * Use Bethard normalizer to get TimeML value. */ public static String getTimexMLValue(String timex) { URL grammarURL = DurationEventTimeFeatureExtractor.class.getResource("/info/bethard/timenorm/en.grammar"); TemporalExpressionParser parser = new TemporalExpressionParser(grammarURL, DefaultTokenizer$.MODULE$); TimeSpan anchor = TimeSpan.of(2013, 12, 16); Try<Temporal> result = parser.parse(timex, anchor); String value = null; if (result.isSuccess()) { Temporal temporal = result.get(); value = temporal.timeMLValue(); } return value; } /** * Use Bethard normalizer to get TimeML value. */ public static String getTimexMLValue(String timex, String anchorStr) { String anchstr = getTimexMLValue(anchorStr); URL grammarURL = DurationEventTimeFeatureExtractor.class.getResource("/info/bethard/timenorm/en.grammar"); TemporalExpressionParser parser = new TemporalExpressionParser(grammarURL, DefaultTokenizer$.MODULE$); TimeSpan anchor = TimeSpan.fromTimeMLValue(anchstr);//.of(2013, 12, 16); Try<Temporal> result = parser.parse(timex, anchor); String value = null; if (result.isSuccess()) { Temporal temporal = result.get(); value = temporal.timeMLValue(); } return value; } /** * Take the time unit from Bethard noramlizer * and return a coarser time unit, i.e. one of the eight bins. * Return null, if this cannot be done. */ public static String putInBin(String timeUnit) { HashSet<String> allowableTimeUnits = new HashSet<>(Arrays.asList(bins)); // e.g. Years -> year String singularAndLowercased = timeUnit.substring(0, timeUnit.length() - 1).toLowerCase(); // is this one of the bins already? if (allowableTimeUnits.contains(singularAndLowercased)) { return singularAndLowercased; } // units that Betard normalizer outputs mapped to one of the eight bins Map<String, String> mapping = ImmutableMap.<String, String>builder().put("afternoon", "hour") .put("evening", "hour").put("morning", "hour").put("night", "hour").put("fall", "month") .put("winter", "month").put("spring", "month").put("summer", "month").put("quarteryear", "month") .build(); // it's not one of the bins; can we map to to a bin? if (mapping.get(singularAndLowercased) != null) { return mapping.get(singularAndLowercased); } // we couldn't map it to a bin return null; } /** * Compute expected duration in seconds. Normalize by number of seconds in a decade. */ public static float expectedDuration(Map<String, Float> distribution) { // unit of time -> duration in seconds final Map<String, Integer> timeUnitInSeconds = ImmutableMap.<String, Integer>builder().put("second", 1) .put("minute", 60).put("hour", 60 * 60).put("day", 60 * 60 * 24).put("week", 60 * 60 * 24 * 7) .put("month", 60 * 60 * 24 * 30).put("year", 60 * 60 * 24 * 365) .put("decade", 60 * 60 * 24 * 365 * 10).build(); float expectation = 0f; for (String unit : distribution.keySet()) { expectation = expectation + (timeUnitInSeconds.get(unit) * distribution.get(unit)); } return expectation / timeUnitInSeconds.get("decade"); } /** * Take a time unit and return a probability distribution * in which p(this time unit) = 1 and all others are zero. * Assume time unit is one of the eight duration bins. */ public static Map<String, Float> convertToDistribution(String timeUnit) { Map<String, Float> distribution = new HashMap<String, Float>(); for (String bin : bins) { if (bin.equals(timeUnit)) { distribution.put(bin, 1.0f); } else { distribution.put(bin, 0.0f); } } return distribution; } /** * Convert duration distribution multiset to a format that's easy to parse automatically. * Format: <sign/symptom>, <time bin>:<count>, ... * Example: apnea, second:5, minute:1, hour:5, day:10, week:1, month:0, year:0 */ public static String formatDistribution(String mentionText, Multiset<String> durationDistribution, String separator, boolean normalize) { List<String> distribution = new LinkedList<String>(); distribution.add(mentionText); double total = 0; if (normalize) { for (String bin : bins) { total += durationDistribution.count(bin); } } for (String bin : bins) { if (normalize) { distribution.add(String.format("%s:%.3f", bin, durationDistribution.count(bin) / total)); } else { distribution.add(String.format("%s:%d", bin, durationDistribution.count(bin))); } } Joiner joiner = Joiner.on(separator); return joiner.join(distribution); } /** * Get relation context. */ public static String getTextBetweenAnnotations(JCas jCas, Annotation arg1, Annotation arg2) { final int windowSize = 5; String text = jCas.getDocumentText(); int leftArgBegin = Math.min(arg1.getBegin(), arg2.getBegin()); int rightArgEnd = Math.max(arg1.getEnd(), arg2.getEnd()); int begin = Math.max(0, leftArgBegin - windowSize); int end = Math.min(text.length(), rightArgEnd + windowSize); return text.substring(begin, end).replaceAll("[\r\n]", " "); } /** * Lemmatize word using ClearNLP lemmatizer. */ public static String lemmatize(String word, String pos) throws IOException { final String ENG_LEMMATIZER_DATA_FILE = "org/apache/ctakes/dependency/parser/models/lemmatizer/dictionary-1.3.1.jar"; AbstractMPAnalyzer lemmatizer; InputStream lemmatizerModel = FileLocator.getAsStream(ENG_LEMMATIZER_DATA_FILE); lemmatizer = EngineGetter.getMPAnalyzer(AbstractReader.LANG_EN, lemmatizerModel); String lemma = lemmatizer.getLemma(word, pos); lemmatizerModel.close(); return lemma; } /** * Return system generated POS tag or null if none available. */ public static String getPosTag(JCas systemView, Annotation annotation) { List<BaseToken> coveringBaseTokens = JCasUtil.selectCovered(systemView, BaseToken.class, annotation.getBegin(), annotation.getEnd()); if (coveringBaseTokens.size() < 1) { return null; } return coveringBaseTokens.get(0).getPartOfSpeech(); } /** * Keep UMLS concepts and non-verbs intact. Lemmatize verbs. * Lowercase before returning. */ public static String normalizeEventText(JCas jCas, Annotation annotation) throws AnalysisEngineProcessException { JCas systemView; try { systemView = jCas.getView("_InitialView"); } catch (CASException e) { throw new AnalysisEngineProcessException(e); } List<EventMention> coveringSystemEventMentions = JCasUtil.selectCovered(systemView, EventMention.class, annotation.getBegin(), annotation.getEnd()); for (EventMention systemEventMention : coveringSystemEventMentions) { if (systemEventMention.getTypeID() != 0) { return annotation.getCoveredText().toLowerCase(); } } String pos = Utils.getPosTag(systemView, annotation); if (pos == null) { return annotation.getCoveredText().toLowerCase(); } String text; if (pos.startsWith("V")) { try { text = Utils.lemmatize(annotation.getCoveredText().toLowerCase(), pos); } catch (IOException e) { System.out.println("couldn't lemmatize: " + annotation.getCoveredText()); e.printStackTrace(); return annotation.getCoveredText().toLowerCase(); } } else { text = annotation.getCoveredText(); } return text.toLowerCase(); } /** * Read event duration distributions from file. */ public static class Callback implements LineProcessor<Map<String, Map<String, Float>>> { // map event text to its duration distribution private Map<String, Map<String, Float>> textToDistribution; public Callback() { textToDistribution = new HashMap<String, Map<String, Float>>(); } public boolean processLine(String line) throws IOException { String[] elements = line.split(", "); // e.g. pain, second:0.000, minute:0.005, hour:0.099, ... Map<String, Float> distribution = new HashMap<String, Float>(); for (int durationBinNumber = 1; durationBinNumber < elements.length; durationBinNumber++) { String[] durationAndValue = elements[durationBinNumber].split(":"); // e.g. "day:0.475" distribution.put(durationAndValue[0], Float.parseFloat(durationAndValue[1])); } textToDistribution.put(elements[0], distribution); return true; } public Map<String, Map<String, Float>> getResult() { return textToDistribution; } } /** * Instantiate an XMI collection reader. */ public static CollectionReader getCollectionReader(List<File> inputFiles) throws Exception { List<String> fileNames = new ArrayList<>(); for (File file : inputFiles) { if (!(file.isHidden())) { fileNames.add(file.getPath()); } } String[] paths = new String[fileNames.size()]; fileNames.toArray(paths); return CollectionReaderFactory.createReader(XMIReader.class, XMIReader.PARAM_FILES, paths); } /** * Get files for specific sets of patients. * Useful for selecting e.g. only training files. */ public static List<File> getFilesFor(List<Integer> patientSets, File inputDirectory) { List<File> files = new ArrayList<>(); for (Integer set : patientSets) { final int setNum = set; for (File file : inputDirectory.listFiles(new FilenameFilter() { @Override public boolean accept(File dir, String name) { return name.contains(String.format("ID%03d", setNum)); } })) { // skip hidden files like .svn if (!file.isHidden()) { files.add(file); } } } return files; } /** * Output label and list of cleartk features to a file for debugging. */ public static void writeInstance(String label, List<Feature> features, String fileName) { StringBuffer output = new StringBuffer(label); for (Feature feature : features) { if (feature.getName() == null || feature.getValue() == null) { continue; } String name = feature.getName(); Object value = feature.getValue(); String nameValuePair; if (value instanceof String) { String cleanedUpName = name.replace(",", "COMMA").replace(":", "COLON").replace("\n", "EOL"); String cleanedUpValue = value.toString().replace(",", "COMMA").replace(":", "COLON").replace("\n", "EOL"); nameValuePair = String.format(",%s-%s:%s", cleanedUpName, cleanedUpValue, 1); } else if (value instanceof Integer) { String cleanedUpName = name.replace(",", "COMMA").replace(":", "COLON").replace("\n", "EOL"); String cleanedUpValue = value.toString().replace(",", "COMMA").replace(":", "COLON").replace("\n", "EOL"); nameValuePair = String.format(",%s:%s", cleanedUpName, cleanedUpValue); } else { continue; } output.append(nameValuePair); } try { Files.append(output + "\n", new File(fileName), Charsets.UTF_8); } catch (IOException e) { System.err.println("could not write to output file!"); } } public static void main(String[] args) { HashSet<String> timeUnits = getTimeUnits("three months"); System.out.println(timeUnits); } }