List of usage examples for org.apache.lucene.analysis.miscellaneous PerFieldAnalyzerWrapper tokenStream
public final TokenStream tokenStream(final String fieldName, final Reader reader)
Returns a TokenStream for the given fieldName, tokenizing the contents of reader. From source file:com.github.mosuka.apache.lucene.example.utils.LuceneExampleUtilTest.java
License:Apache License
/**
 * Verifies that the analyzer wrapper built by {@code LuceneExampleUtil.createAnalyzerWrapper()}
 * tokenizes the "id" field verbatim and the "text" field with lowercasing and
 * punctuation splitting.
 *
 * @throws IOException if token stream consumption fails
 */
public void testCreateAnalyzerWrapper() throws IOException {
    PerFieldAnalyzerWrapper wrapper = LuceneExampleUtil.createAnalyzerWrapper();

    List<String> expectedIdTermList = new LinkedList<String>(Arrays.asList("1"));
    assertEquals(expectedIdTermList, collectTerms(wrapper, "id", "1"));

    List<String> expectedTextTermList = new LinkedList<String>(
            Arrays.asList("lucene", "is", "a", "full", "text", "search", "library"));
    assertEquals(expectedTextTermList,
            collectTerms(wrapper, "text", "Lucene is a Full-text search library."));
}

/**
 * Runs {@code text} through the wrapper's analyzer for {@code field} and returns
 * the emitted terms in order.
 *
 * <p>Uses try-with-resources so the TokenStream is closed even if
 * {@code incrementToken()} throws (the original leaked the stream on failure),
 * and calls {@code end()} per the documented Lucene consume workflow.
 */
private static List<String> collectTerms(PerFieldAnalyzerWrapper wrapper, String field, String text)
        throws IOException {
    List<String> terms = new LinkedList<String>();
    try (TokenStream tokenStream = wrapper.tokenStream(field, text)) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
    }
    return terms;
}
From source file:nicta.com.au.patent.pac.analysis.FieldsCosineSimilarities.java
public final Map<String, Double>[] parse(PatentDocument pt) throws IOException, Exception { Map<String, Double>[] out = new Map[5]; String[] ptFields = new String[5]; String title = ""; String ipc = ""; String abstrac = ""; String description = ""; String claims = ""; for (InventionTitle inventionTitle : pt.getTechnicalData().getInventionTitle()) { if (inventionTitle.getLang().toLowerCase().equals("en")) { title = inventionTitle.getContent(); }//from ww w. j ava2s.com } Map<String, Double> m1 = new HashMap<>(); for (ClassificationIpcr ipcCode : pt.getTechnicalData().getClassificationIpcr()) { StringTokenizer st = new StringTokenizer(ipcCode.getContent()); m1.put(st.nextToken(), 1.0); } if (pt.getAbstrac().getLang() != null && pt.getAbstrac().getLang().toLowerCase().equals("en")) { abstrac = pt.getAbstrac().getContent(); } if (pt.getDescription() != null && pt.getDescription().getLang().toLowerCase().equals("en")) { for (P p : pt.getDescription().getP()) { description += p.getContent() + " "; } } for (Claims cs : pt.getClaims()) { if (cs.getLang().toLowerCase().equals("en")) { for (Claim claim : cs.getClaim()) { claims += claim.getClaimText() + " "; } } } ptFields[0] = title; ptFields[1] = ipc; ptFields[2] = abstrac; ptFields[3] = description; ptFields[4] = claims; Map<String, Analyzer> analyzerPerField = new HashMap<>(); if (specificStopWords == true) { analyzerPerField.put(PatentDocument.Title, new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.TITLE_ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Abstract, new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ABSTRACT_ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Description, new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.DESCRIPTION_ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Claims, new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.CLAIMS_ENGLISH_STOP_WORDS_SET)); } else { analyzerPerField.put(PatentDocument.Title, new 
EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Abstract, new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Description, new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Claims, new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ENGLISH_STOP_WORDS_SET)); } PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_44), analyzerPerField); Map<String, Double> m0 = getVector(analyzer.tokenStream(PatentDocument.Title, ptFields[0]), PatentDocument.Title); Map<String, Double> m2 = getVector(analyzer.tokenStream(PatentDocument.Abstract, ptFields[2]), PatentDocument.Abstract); Map<String, Double> m3 = getVector(analyzer.tokenStream(PatentDocument.Description, ptFields[3]), PatentDocument.Description); Map<String, Double> m4 = getVector(analyzer.tokenStream(PatentDocument.Claims, ptFields[4]), PatentDocument.Claims); out[0] = m0; out[1] = m1; out[2] = m2; out[3] = m3; out[4] = m4; return out; }
From source file:nicta.com.au.patent.pac.analysis.FieldsJaccardSimilarities.java
/**
 * Builds five term-presence maps for the given patent document:
 * [0]=title, [1]=IPC codes, [2]=abstract, [3]=description, [4]=claims.
 * Only English-language content ("en") is considered. Each text field is
 * tokenized with an EnglishAnalyzer, using field-specific stop-word sets
 * when {@code specificStopWords} is enabled.
 *
 * @param pt the patent document to process
 * @return array of five term-to-count maps, indexed as described above
 * @throws IOException if analysis of any field fails
 */
public final Map<String, Integer>[] parse(PatentDocument pt) throws IOException {
    Map<String, Integer>[] out = new Map[5];
    String[] ptFields = new String[5];

    String title = "";
    for (InventionTitle inventionTitle : pt.getTechnicalData().getInventionTitle()) {
        if (inventionTitle.getLang().toLowerCase().equals("en")) {
            title = inventionTitle.getContent();
        }
    }

    // IPC map is built directly from the classification codes; only the
    // leading token of each code is kept, each with count 1.
    Map<String, Integer> m1 = new HashMap<>();
    for (ClassificationIpcr ipcCode : pt.getTechnicalData().getClassificationIpcr()) {
        StringTokenizer st = new StringTokenizer(ipcCode.getContent());
        m1.put(st.nextToken(), 1);
    }

    // Guard getAbstrac() itself against null (the original only checked getLang(),
    // risking an NPE when the abstract element is absent).
    String abstrac = "";
    if (pt.getAbstrac() != null && pt.getAbstrac().getLang() != null
            && pt.getAbstrac().getLang().toLowerCase().equals("en")) {
        abstrac = pt.getAbstrac().getContent();
    }

    // StringBuilder avoids O(n^2) string concatenation over paragraphs/claims.
    StringBuilder description = new StringBuilder();
    if (pt.getDescription() != null && pt.getDescription().getLang().toLowerCase().equals("en")) {
        for (P p : pt.getDescription().getP()) {
            description.append(p.getContent()).append(" ");
        }
    }
    StringBuilder claims = new StringBuilder();
    for (Claims cs : pt.getClaims()) {
        if (cs.getLang().toLowerCase().equals("en")) {
            for (Claim claim : cs.getClaim()) {
                claims.append(claim.getClaimText()).append(" ");
            }
        }
    }

    ptFields[0] = title;
    ptFields[1] = "";  // IPC slot is unused as text; its map (m1) is built above
    ptFields[2] = abstrac;
    ptFields[3] = description.toString();
    ptFields[4] = claims.toString();

    Map<String, Analyzer> analyzerPerField = new HashMap<>();
    if (specificStopWords) {
        analyzerPerField.put(PatentDocument.Title,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.TITLE_ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Abstract,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ABSTRACT_ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Description,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.DESCRIPTION_ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Claims,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.CLAIMS_ENGLISH_STOP_WORDS_SET));
    } else {
        analyzerPerField.put(PatentDocument.Title,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Abstract,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Description,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Claims,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
    }
    PerFieldAnalyzerWrapper analyzer =
            new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_48), analyzerPerField);

    out[0] = transformation(analyzer.tokenStream(PatentDocument.Title, ptFields[0]));
    out[1] = m1;
    out[2] = transformation(analyzer.tokenStream(PatentDocument.Abstract, ptFields[2]));
    out[3] = transformation(analyzer.tokenStream(PatentDocument.Description, ptFields[3]));
    out[4] = transformation(analyzer.tokenStream(PatentDocument.Claims, ptFields[4]));
    return out;
}