Example usage for opennlp.tools.util Span getStart

List of usage examples for opennlp.tools.util Span getStart

Introduction

In this page you can find the example usage for opennlp.tools.util Span getStart.

Prototype

public int getStart() 

Source Link

Document

Return the start of a span.

Usage

From source file:com.civprod.writerstoolbox.testarea.UnsupervisedDiscourseSegmentation.java

/**
 * Unsupervised discourse segmentation (TextTiling-style): splits a document's
 * token stream into topically coherent segments.
 *
 * @param inDocument         the document to segment.
 * @param inSentenceDetector sentence detector used during tokenization.
 * @param inStringTokenizer  tokenizer used during tokenization.
 * @return the document's tokens partitioned into consecutive segments.
 */
public static List<List<String>> segment(Document<?> inDocument, SentenceDetector inSentenceDetector,
        StringTokenizer inStringTokenizer) {
    // Tokenize the whole document, then stem and stop-filter the tokens.
    List<String> concatenateTokens = concatenateTokens(inDocument, inSentenceDetector, inStringTokenizer);
    List<String> stemmAndFilterList = TokenUtil.stemmAndFilterList(concatenateTokens);
    // Chop the filtered token stream into fixed-size pseudo-sentences of 20 tokens.
    List<List<String>> splitIntoFixLengthLists = splitIntoFixLengthLists(stemmAndFilterList, 20);
    List<Counter<String>> counters = splitIntoFixLengthLists.parallelStream()
            .map((List<String> curSentence) -> CounterUtils.count(curSentence)).collect(Collectors.toList());
    // Cosine similarity across each candidate gap, comparing a 10-block window
    // on either side. Guard against tiny documents (fewer than 21 blocks).
    int similarityCount = Math.max(0, counters.size() - 20);
    List<Double> cosineSimilarity = new ArrayList<>(similarityCount);
    for (int i = 0; i < similarityCount; i++) {
        // NOTE(review): the right window starts at i + 11, skipping block i + 10 —
        // confirm the one-block gap is intended rather than subList(i + 10, i + 20).
        cosineSimilarity.add(cosineSimilarityStemmedAndFiltered(Counter.join(counters.subList(i, i + 10)),
                Counter.join(counters.subList(i + 11, i + 20))));
    }
    // Depth score at each interior point: how far similarity dips below both neighbors.
    // BUG FIX: the original loop was bounded by valleys.size(), which is 0 for a
    // freshly created list, so no valley was ever computed and no boundary found.
    int valleyCount = Math.max(0, cosineSimilarity.size() - 2);
    List<Double> valleys = new ArrayList<>(valleyCount);
    for (int i = 0; i < valleyCount; i++) {
        double ya1 = cosineSimilarity.get(i);
        double ya2 = cosineSimilarity.get(i + 1);
        double ya3 = cosineSimilarity.get(i + 2);
        valleys.add((ya1 - ya2) + (ya3 - ya2));
    }
    SummaryStatistics valleyStatistics = valleys.parallelStream().collect(SummaryStatisticCollector.instance);
    // A gap whose depth score falls below (mean - stddev) is taken as a topic boundary.
    double cutoffThreshold = valleyStatistics.getMean() - valleyStatistics.getStandardDeviation();
    int lastLocation = 0;
    List<Span> spans = new ArrayList<>(1);
    for (int i = 0; i < valleys.size(); i++) {
        double curValley = valleys.get(i);
        if (curValley < cutoffThreshold) {
            // Convert the valley index back to a token offset (20 tokens per block).
            int curLocation = (i + 11) * 20;
            spans.add(new Span(lastLocation, curLocation));
            lastLocation = curLocation;
        }
    }
    // Trailing segment: from the last boundary to the end of the document.
    spans.add(new Span(lastLocation, concatenateTokens.size()));
    return spans.parallelStream()
            .map((Span curSpan) -> concatenateTokens.subList(curSpan.getStart(), curSpan.getEnd()))
            .collect(Collectors.toList());
}

From source file:edu.stanford.muse.index.NER.java

/**
 * Smoke test: runs the OpenNLP tokenizer and the person/location/organization
 * name finders over the text in /tmp/in, printing each entity found.
 */
public static void testOpenNLP() {

    // FIX: the model InputStreams were never closed in the original;
    // try-with-resources guarantees they are released even on failure.
    try (InputStream pis = Config.getResourceAsStream("en-ner-person.bin");
            InputStream lis = Config.getResourceAsStream("en-ner-location.bin");
            InputStream ois = Config.getResourceAsStream("en-ner-organization.bin");
            InputStream tokenStream = Config.getResourceAsStream("en-token.bin")) {
        String s = Util.readFile("/tmp/in");

        TokenNameFinderModel pmodel = new TokenNameFinderModel(pis);
        TokenNameFinderModel lmodel = new TokenNameFinderModel(lis);
        TokenNameFinderModel omodel = new TokenNameFinderModel(ois);
        TokenizerModel modelTokenizer = new TokenizerModel(tokenStream);
        TokenizerME tokenizer = new TokenizerME(modelTokenizer);

        // Tokenize with character offsets, then materialize the token strings.
        Span[] tokSpans = tokenizer.tokenizePos(s);
        String tokens[] = new String[tokSpans.length];
        for (int i = 0; i < tokSpans.length; i++)
            tokens[i] = s.substring(tokSpans[i].getStart(), tokSpans[i].getEnd());

        NameFinderME pFinder = new NameFinderME(pmodel);
        Span[] pSpans = pFinder.find(tokens);
        NameFinderME lFinder = new NameFinderME(lmodel);
        Span[] lSpans = lFinder.find(tokens);
        NameFinderME oFinder = new NameFinderME(omodel);
        Span[] oSpans = oFinder.find(tokens);

        printSpans("Names found:", pSpans, tokens);
        printSpans("Locations found:", lSpans, tokens);
        printSpans("Orgs found:", oSpans, tokens);
    } catch (IOException e) {
        e.printStackTrace();
    }
}

/** Prints the given heading, then each span's tokens on its own line. */
private static void printSpans(String heading, Span[] spans, String[] tokens) {
    System.out.println(heading);
    for (Span span : spans) {
        for (int i = span.getStart(); i < span.getEnd(); i++)
            System.out.print(tokens[i] + " ");
        System.out.println();
    }
}

From source file:edu.stanford.muse.index.NER.java

/**
 * Runs OpenNLP NER over the given text, sentence by sentence, and returns
 * a tokenizer over the recognized entities plus a list of triples of
 * (entity type, start char offset inclusive, end char offset exclusive),
 * with offsets relative to the whole document.
 * see http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/ie/AbstractSequenceClassifier.html#classifyToCharacterOffsets(java.lang.String)
 */
private synchronized static Pair<MyTokenizer, List<Triple<String, Integer, Integer>>> parseAndGetOffsets(
        String documentText) {
    try {
        NER.initialize();
    } catch (Exception e) {
        Util.print_exception(e, log);
    }

    // 0xA0 (non-breaking space) is seen often and generates a lot of annoying messages.
    // BUG FIX: the original tested indexOf(...) > 0, which missed a NBSP at offset 0.
    if (documentText.contains("\u00A0"))
        documentText = documentText.replaceAll("\\xA0", " ");
    // replace i18n chars with space, causes annoying NER messages + perhaps slows down NER?
    if (REMOVE_I18N_CHARS)
        documentText = cleanI18NChars(documentText);

    List<Pair<String, String>> namedEntities = new ArrayList<Pair<String, String>>(); // token-type pairs
    List<Triple<String, Integer, Integer>> allTriples = new ArrayList<Triple<String, Integer, Integer>>(); // string, start-end pairs

    // Do NER sentence by sentence -- much faster than doing the entire documentText at once.
    Span sentenceSpans[] = sFinder.sentPosDetect(documentText);

    for (Span sentenceSpan : sentenceSpans) {
        int sentenceStartOffset = sentenceSpan.getStart();
        String sentence = sentenceSpan.getCoveredText(documentText).toString();
        if (sentence.length() > 2000)
            continue; // that's not a reasonable sentence, could be a uuencoded-something.

        // Convert the sentence to tokens because that's what the name finders need.
        Span[] tokSpans = tokenizer.tokenizePos(sentence);
        String tokens[] = new String[tokSpans.length];
        for (int i = 0; i < tokSpans.length; i++)
            tokens[i] = tokSpans[i].getCoveredText(sentence).toString();

        // Find the token-level spans that represent names of each type.
        Span[] pSpans = pFinder.find(tokens);
        Span[] lSpans = lFinder.find(tokens);
        Span[] oSpans = oFinder.find(tokens);
        List<Triple<String, Integer, Integer>> sentenceTriples = new ArrayList<Triple<String, Integer, Integer>>(); // string, start-end pairs

        for (Span span : pSpans)
            sentenceTriples.add(new Triple<String, Integer, Integer>("PERSON", span.getStart(), span.getEnd()));
        for (Span span : lSpans)
            sentenceTriples
                    .add(new Triple<String, Integer, Integer>("LOCATION", span.getStart(), span.getEnd()));
        for (Span span : oSpans)
            sentenceTriples
                    .add(new Triple<String, Integer, Integer>("ORGANIZATION", span.getStart(), span.getEnd()));

        for (Triple<String, Integer, Integer> t : sentenceTriples) {
            String type = t.first();
            if (type == null)
                type = "UNKNOWN"; // we see type = null sometimes #!@#$
            allTypes.add(type);
            int startTok = t.second();
            int endTok = t.third();

            // Map token indices back to character offsets within the sentence.
            String namedEntity = sentence.substring(tokSpans[startTok].getStart(),
                    tokSpans[endTok - 1].getEnd());

            // We tend to see a lot of annoying [Hi Sam] or [Dear Caroline] phrases;
            // strip such greeting prefixes one at a time (they may be chained).
            namedEntity = stripGreetingPrefix(namedEntity, "hi ");
            namedEntity = stripGreetingPrefix(namedEntity, "hello ");
            namedEntity = stripGreetingPrefix(namedEntity, "dear ");
            namedEntity = stripGreetingPrefix(namedEntity, "cheers ");
            namedEntity = stripGreetingPrefix(namedEntity, "thanks ");

            if (DictUtils.tabooNames.contains(namedEntity.toLowerCase()))
                continue;
            if (!nameFilterPass(namedEntity))
                continue;

            if (namedEntity.length() < MIN_NAME_LENGTH || namedEntity.length() > MAX_NAME_LENGTH) // drop it
                continue;
            namedEntities.add(new Pair<String, String>(namedEntity, type));
            if (log.isDebugEnabled())
                log.debug(t.first() + " : [" + t.second() + ":" + t.third() + "] " + namedEntity);
        }

        // Sentence triple offsets cannot be used directly ... they have to be converted
        // to offsets within the entire document by adding sentenceStartOffset.
        for (Triple<String, Integer, Integer> t : sentenceTriples) {
            int startTok = t.second();
            int endTok = t.third();
            int start = tokSpans[startTok].getStart(), end = tokSpans[endTok - 1].getEnd();

            allTriples.add(new Triple<String, Integer, Integer>(t.getFirst(), sentenceStartOffset + start,
                    sentenceStartOffset + end));
        }
    }

    return new Pair<MyTokenizer, List<Triple<String, Integer, Integer>>>(new NERTokenizer(namedEntities),
            allTriples);
}

/**
 * If s starts (case-insensitively) with the given greeting prefix, returns s
 * with that prefix removed and trimmed; otherwise returns s unchanged.
 */
private static String stripGreetingPrefix(String s, String greeting) {
    if (s.toLowerCase().startsWith(greeting))
        return s.substring(greeting.length()).trim();
    return s;
}

From source file:com.civis.utils.opennlp.models.address.AddressSpanBuilder.java

/** Renders the tokens covered by the given span via the offset-based overload. */
private String buildString(Span span, String[] tokens) {
    final int from = span.getStart();
    final int to = span.getEnd();
    return buildString(from, to, tokens);
}

From source file:de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpSegmenter.java

@Override
protected void process(JCas aJCas, String aText, int aZoneBegin) throws AnalysisEngineProcessException {
    // Detect sentences, then tokenize each sentence's text; every offset is
    // shifted by the sentence start and the zone begin so annotations land in
    // document coordinates.
    for (Span sentence : sentenceModelProvider.getResource().sentPosDetect(aText)) {
        final int sentBegin = sentence.getStart();
        final int sentEnd = sentence.getEnd();
        createSentence(aJCas, sentBegin + aZoneBegin, sentEnd + aZoneBegin);
        final String sentenceText = aText.substring(sentBegin, sentEnd);
        for (Span token : tokenModelProvider.getResource().tokenizePos(sentenceText)) {
            createToken(aJCas, token.getStart() + sentBegin + aZoneBegin,
                    token.getEnd() + sentBegin + aZoneBegin);
        }
    }
}

From source file:opennlp.tools.util.Span.java

  /**
 * Checks whether the specified span lies entirely within this span.
 * Identical spans are considered to contain each other.
 *
 * @param s The span to compare with this span.
 *
 * @return true if the specified span is contained by this span;
 * false otherwise.
 */
public boolean contains(Span s) {
  boolean beginsInside = start <= s.getStart();
  boolean endsInside = s.getEnd() <= end;
  return beginsInside && endsInside;
}

From source file:opennlp.tools.util.Span.java

  /**
 * Checks whether the specified span overlaps this span.
 *
 * @param s The span to compare with this span.
 *
 * @return true if the spans overlap; false otherwise.
 */
public boolean intersects(Span s) {
  // Overlap holds when one span contains the other, or when either span's
  // start point falls strictly inside the other span.
  if (this.contains(s) || s.contains(this)) {
    return true;
  }
  int otherStart = s.getStart();
  boolean otherStartInThis = getStart() <= otherStart && otherStart < getEnd();
  boolean thisStartInOther = otherStart <= getStart() && getStart() < s.getEnd();
  return otherStartInThis || thisStartInOther;
}

From source file:opennlp.tools.util.Span.java

  /**
 * Checks whether the specified span crosses this span.
 *
 * @param s The span to compare with this span.
 *
 * @return true if the specified span overlaps this span and contains a
 * non-overlapping section; false otherwise.
 */
public boolean crosses(Span s) {
  // Full containment (in either direction) is not a crossing.
  if (this.contains(s) || s.contains(this)) {
    return false;
  }
  // A crossing remains when either span's start falls inside the other.
  int otherStart = s.getStart();
  boolean otherStartInThis = getStart() <= otherStart && otherStart < getEnd();
  boolean thisStartInOther = otherStart <= getStart() && getStart() < s.getEnd();
  return otherStartInThis || thisStartInOther;
}

From source file:opennlp.tools.util.Span.java

  /**
 * Checks whether the specified span begins at the same offset as this span
 * and is contained within it.
 *
 * @param s The span to compare with this span.
 *
 * @return true if the specified span starts with this span and is
 * contained in this span; false otherwise
 */
public boolean startsWith(Span s) {
  boolean sameStart = s.getStart() == getStart();
  return sameStart && contains(s);
}

From source file:opennlp.tools.util.Span.java

  /**
 * Compares the specified span to the current span: spans are ordered by
 * ascending start offset; on equal starts, the span with the larger end
 * (the longer span) sorts first.
 *
 * @param o the {@code Span} to compare against.
 * @return -1, 0, or 1 as this span sorts before, equal to, or after {@code o}.
 */
public int compareTo(Object o) {
  Span other = (Span) o;
  // Integer.compare returns exactly -1, 0, or 1, matching the original
  // hand-rolled comparison chain.
  int byStart = Integer.compare(getStart(), other.getStart());
  if (byStart != 0) {
    return byStart;
  }
  // Equal starts: descending order on end offset.
  return Integer.compare(other.getEnd(), getEnd());
}