Example usage for org.apache.lucene.analysis.miscellaneous PerFieldAnalyzerWrapper tokenStream

List of usage examples for org.apache.lucene.analysis.miscellaneous PerFieldAnalyzerWrapper tokenStream

Introduction

On this page you can find an example of how to use org.apache.lucene.analysis.miscellaneous PerFieldAnalyzerWrapper tokenStream.

Prototype

public final TokenStream tokenStream(final String fieldName, final Reader reader) 

Source Link

Document

Returns a TokenStream suitable for fieldName, tokenizing the contents of reader.

Usage

From source file:com.github.mosuka.apache.lucene.example.utils.LuceneExampleUtilTest.java

License:Apache License

/**
 * Verifies that the wrapper returned by {@code LuceneExampleUtil.createAnalyzerWrapper()}
 * applies per-field analysis: the "id" field is kept as a single verbatim term, while the
 * "text" field is lower-cased and split into word terms.
 *
 * @throws IOException if token-stream consumption fails
 */
public void testCreateAnalyzerWrapper() throws IOException {
    PerFieldAnalyzerWrapper wrapper = LuceneExampleUtil.createAnalyzerWrapper();

    // "id" field: expect the input to survive as one unmodified token.
    List<String> expectedIdTermList = new LinkedList<String>(Arrays.asList("1"));
    List<String> actualIdTermList = new LinkedList<String>();
    // try-with-resources guarantees the stream is closed even if reset()/incrementToken()
    // throws — the original only closed on the success path and leaked on failure.
    try (TokenStream tokenStream = wrapper.tokenStream("id", "1")) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            actualIdTermList.add(charTermAttribute.toString());
        }
        // Required by the TokenStream workflow: reset -> incrementToken* -> end -> close.
        tokenStream.end();
    }
    assertEquals(expectedIdTermList, actualIdTermList);

    // "text" field: expect lower-cased terms split on whitespace/punctuation.
    List<String> expectedTextTermList = new LinkedList<String>(
            Arrays.asList("lucene", "is", "a", "full", "text", "search", "library"));
    List<String> actualTextTermList = new LinkedList<String>();
    try (TokenStream tokenStream = wrapper.tokenStream("text", "Lucene is a Full-text search library.")) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            actualTextTermList.add(charTermAttribute.toString());
        }
        tokenStream.end();
    }
    assertEquals(expectedTextTermList, actualTextTermList);
}

From source file:nicta.com.au.patent.pac.analysis.FieldsCosineSimilarities.java

/**
 * Extracts five per-field term vectors from an English patent document:
 * index 0 = title, 1 = IPC classification codes, 2 = abstract, 3 = description, 4 = claims.
 * Title/abstract/description/claims are analyzed with field-specific (or shared) English
 * stop-word sets; IPC codes are taken verbatim (first token of each code, weight 1.0).
 *
 * @param pt the patent document to vectorize
 * @return an array of five term-to-weight maps, ordered as described above
 * @throws IOException if token-stream consumption fails
 * @throws Exception   propagated from downstream helpers
 */
public final Map<String, Double>[] parse(PatentDocument pt) throws IOException, Exception {
    // Generic array creation is unavoidably unchecked; only Map<String, Double> is stored.
    @SuppressWarnings("unchecked")
    Map<String, Double>[] out = new Map[5];
    String[] ptFields = new String[5];
    String title = "";
    // NOTE(review): ipc is never assigned, so ptFields[1] stays "" — IPC codes feed m1
    // directly instead. Confirm this is intentional.
    String ipc = "";
    String abstrac = "";

    // Pick the English-language title, if any.
    for (InventionTitle inventionTitle : pt.getTechnicalData().getInventionTitle()) {
        if (inventionTitle.getLang().toLowerCase().equals("en")) {
            title = inventionTitle.getContent();
        }
    }

    // IPC vector: first whitespace token of each classification code, uniform weight 1.0.
    Map<String, Double> m1 = new HashMap<>();
    for (ClassificationIpcr ipcCode : pt.getTechnicalData().getClassificationIpcr()) {
        StringTokenizer st = new StringTokenizer(ipcCode.getContent());
        m1.put(st.nextToken(), 1.0);
    }

    if (pt.getAbstrac().getLang() != null && pt.getAbstrac().getLang().toLowerCase().equals("en")) {
        abstrac = pt.getAbstrac().getContent();
    }

    // StringBuilder avoids the O(n^2) cost of repeated String += in these loops.
    StringBuilder description = new StringBuilder();
    if (pt.getDescription() != null && pt.getDescription().getLang().toLowerCase().equals("en")) {
        for (P p : pt.getDescription().getP()) {
            description.append(p.getContent()).append(' ');
        }
    }
    StringBuilder claims = new StringBuilder();
    for (Claims cs : pt.getClaims()) {
        if (cs.getLang().toLowerCase().equals("en")) {
            for (Claim claim : cs.getClaim()) {
                claims.append(claim.getClaimText()).append(' ');
            }
        }
    }

    ptFields[0] = title;
    ptFields[1] = ipc;
    ptFields[2] = abstrac;
    ptFields[3] = description.toString();
    ptFields[4] = claims.toString();

    // Per-field analyzers: either field-specific stop-word sets or one shared English set.
    Map<String, Analyzer> analyzerPerField = new HashMap<>();
    if (specificStopWords) {
        analyzerPerField.put(PatentDocument.Title,
                new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.TITLE_ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Abstract,
                new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ABSTRACT_ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Description,
                new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.DESCRIPTION_ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Claims,
                new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.CLAIMS_ENGLISH_STOP_WORDS_SET));
    } else {
        analyzerPerField.put(PatentDocument.Title,
                new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Abstract,
                new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Description,
                new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Claims,
                new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
    }

    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_44),
            analyzerPerField);
    Map<String, Double> m0 = getVector(analyzer.tokenStream(PatentDocument.Title, ptFields[0]),
            PatentDocument.Title);
    Map<String, Double> m2 = getVector(analyzer.tokenStream(PatentDocument.Abstract, ptFields[2]),
            PatentDocument.Abstract);
    Map<String, Double> m3 = getVector(analyzer.tokenStream(PatentDocument.Description, ptFields[3]),
            PatentDocument.Description);
    Map<String, Double> m4 = getVector(analyzer.tokenStream(PatentDocument.Claims, ptFields[4]),
            PatentDocument.Claims);
    out[0] = m0;
    out[1] = m1;
    out[2] = m2;
    out[3] = m3;
    out[4] = m4;
    return out;
}

From source file:nicta.com.au.patent.pac.analysis.FieldsJaccardSimilarities.java

/**
 * Extracts five per-field term-presence maps from an English patent document:
 * index 0 = title, 1 = IPC classification codes, 2 = abstract, 3 = description, 4 = claims.
 * Title/abstract/description/claims are analyzed with field-specific (or shared) English
 * stop-word sets; IPC codes are taken verbatim (first token of each code, count 1).
 *
 * @param pt the patent document to process
 * @return an array of five term-to-count maps, ordered as described above
 * @throws IOException if token-stream consumption fails
 */
public final Map<String, Integer>[] parse(PatentDocument pt) throws IOException {
    // Generic array creation is unavoidably unchecked; only Map<String, Integer> is stored.
    @SuppressWarnings("unchecked")
    Map<String, Integer>[] out = new Map[5];
    String[] ptFields = new String[5];
    String title = "";
    // NOTE(review): ipc is never assigned, so ptFields[1] stays "" — IPC codes feed m1
    // directly instead. Confirm this is intentional.
    String ipc = "";
    String abstrac = "";

    // Pick the English-language title, if any.
    for (InventionTitle inventionTitle : pt.getTechnicalData().getInventionTitle()) {
        if (inventionTitle.getLang().toLowerCase().equals("en")) {
            title = inventionTitle.getContent();
        }
    }

    // IPC map: first whitespace token of each classification code, count 1.
    Map<String, Integer> m1 = new HashMap<>();
    for (ClassificationIpcr ipcCode : pt.getTechnicalData().getClassificationIpcr()) {
        StringTokenizer st = new StringTokenizer(ipcCode.getContent());
        m1.put(st.nextToken(), 1);
    }

    if (pt.getAbstrac().getLang() != null && pt.getAbstrac().getLang().toLowerCase().equals("en")) {
        abstrac = pt.getAbstrac().getContent();
    }

    // StringBuilder avoids the O(n^2) cost of repeated String += in these loops.
    StringBuilder description = new StringBuilder();
    if (pt.getDescription() != null && pt.getDescription().getLang().toLowerCase().equals("en")) {
        for (P p : pt.getDescription().getP()) {
            description.append(p.getContent()).append(' ');
        }
    }
    StringBuilder claims = new StringBuilder();
    for (Claims cs : pt.getClaims()) {
        if (cs.getLang().toLowerCase().equals("en")) {
            for (Claim claim : cs.getClaim()) {
                claims.append(claim.getClaimText()).append(' ');
            }
        }
    }

    ptFields[0] = title;
    ptFields[1] = ipc;
    ptFields[2] = abstrac;
    ptFields[3] = description.toString();
    ptFields[4] = claims.toString();

    // Per-field analyzers: either field-specific stop-word sets or one shared English set.
    Map<String, Analyzer> analyzerPerField = new HashMap<>();
    if (specificStopWords) {
        analyzerPerField.put(PatentDocument.Title,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.TITLE_ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Abstract,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ABSTRACT_ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Description,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.DESCRIPTION_ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Claims,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.CLAIMS_ENGLISH_STOP_WORDS_SET));
    } else {
        analyzerPerField.put(PatentDocument.Title,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Abstract,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Description,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Claims,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
    }

    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_48),
            analyzerPerField);
    Map<String, Integer> m0 = transformation(analyzer.tokenStream(PatentDocument.Title, ptFields[0]));
    Map<String, Integer> m2 = transformation(analyzer.tokenStream(PatentDocument.Abstract, ptFields[2]));
    Map<String, Integer> m3 = transformation(analyzer.tokenStream(PatentDocument.Description, ptFields[3]));
    Map<String, Integer> m4 = transformation(analyzer.tokenStream(PatentDocument.Claims, ptFields[4]));
    out[0] = m0;
    out[1] = m1;
    out[2] = m2;
    out[3] = m3;
    out[4] = m4;
    return out;
}