List of usage examples for org.apache.lucene.analysis.en EnglishAnalyzer EnglishAnalyzer
public EnglishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet)
From source file:back.Searcher.java
License:Apache License
/** Simple command-line based search demo. */ public static void search(String query, boolean stopword, boolean stemming, int consulta) throws Exception { String index = null;/*from www.j a va2s. co m*/ Analyzer analyzer = null; if (!stopword && !stemming) { index = ".\\indexed"; analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT, new CharArraySet(Version.LUCENE_CURRENT, 0, false)); System.out.println("Nenhum Marcado"); } else if (stopword && !stemming) { index = ".\\indexedNoStpWrd"; analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); System.out.println("Primeiro Marcado"); } else if (!stopword && stemming) { index = ".\\indexedStemming"; analyzer = new EnglishAnalyzer(Version.LUCENE_CURRENT, new CharArraySet(Version.LUCENE_CURRENT, 0, false)); System.out.println("Segundo Marcado"); } else if (stopword && stemming) { index = ".\\indexedTreated"; analyzer = new EnglishAnalyzer(Version.LUCENE_CURRENT); System.out.println("Dois Marcados"); } String field = "contents"; String queries = null; int repeat = 0; boolean raw = false; String queryString = query; int hitsPerPage = 200; CSVReader CSVreader = new CSVReader(new FileReader(".\\matriz.csv")); List<String[]> myEntries = CSVreader.readAll(); IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index))); IndexSearcher searcher = new IndexSearcher(reader); BufferedReader in = null; if (queries != null) { in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8")); } else { in = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); } QueryParser parser = new QueryParser(Version.LUCENE_40, field, analyzer); while (true) { if (queries == null && queryString == null) { // prompt the user System.out.println("Enter query: "); } String line = queryString != null ? 
queryString : in.readLine(); if (line == null || line.length() == -1) { break; } line = line.trim(); if (line.length() == 0) { break; } Query query1 = parser.parse(line); System.out.println("Searching for: " + query1.toString(field)); if (repeat > 0) { // repeat & time as benchmark Date start = new Date(); for (int i = 0; i < repeat; i++) { searcher.search(query1, null, 100); } Date end = new Date(); System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms"); } doPagingSearch(in, searcher, query1, hitsPerPage, raw, queries == null && queryString == null, myEntries, consulta); if (queryString != null) { break; } } reader.close(); }
From source file:com.tilab.ca.sse.core.lucene.IndexesUtil.java
License:Open Source License
/** * Initialize the classifiers. This static method initializes the italian * and the english classifiers under the hood. You must call this function * after you have constructed an instance of the SSEVariables class as * described in SSEVariables docs./* ww w .j a v a 2 s .c om*/ * * If you don't call this method, when you use the classifier you will get a * NullPointerException in Classifier(). * * @since 2.0.0.0. */ public static void init() { LOG.debug("[initializator] - BEGIN"); sseConfigFromCache = ConfigCache.getOrCreate(SSEConfig.class); ITALIAN_CORPUS_INDEX_SEARCHER = indexLoading(() -> { // build italian searcher Directory contextIndexDirIT = LuceneManager.pickDirectory(new File(sseConfigFromCache.corpusIndexIT())); LOG.info("Corpus index used for italian: " + contextIndexDirIT); LuceneManager contextLuceneManagerIT = new LuceneManager(contextIndexDirIT); contextLuceneManagerIT.setLuceneDefaultAnalyzer( new ItalianAnalyzer(Version.LUCENE_36, getStopWords(sseConfigFromCache.stopWordsIT()))); return new SimpleSearcher(contextLuceneManagerIT); }).orElse(null); //FIXME not a good use of Optional -> use a default SimpleSearcher ENGLISH_CORPUS_INDEX_SEARCHER = indexLoading(() -> { // build english searcher Directory contextIndexDirEN = LuceneManager.pickDirectory(new File(sseConfigFromCache.corpusIndexEN())); LOG.info("Corpus index used for english: " + contextIndexDirEN); LuceneManager contextLuceneManagerEN = new LuceneManager(contextIndexDirEN); contextLuceneManagerEN.setLuceneDefaultAnalyzer( new EnglishAnalyzer(Version.LUCENE_36, getStopWords(sseConfigFromCache.stopWordsEN()))); return new SimpleSearcher(contextLuceneManagerEN); }).orElse(null); //FIXME not a good use of Optional -> use a default SimpleSearcher if (ITALIAN_CORPUS_INDEX_SEARCHER == null && ENGLISH_CORPUS_INDEX_SEARCHER == null) { throw new RuntimeException("Indexes not available"); } LOG.debug("[initializator] - END"); }
From source file:edu.stanford.muse.index.Indexer.java
License:Apache License
/** * main entry point for indexing. note: recomputeCards has to be called * separately// w w w. ja va2s . c o m */ /* * void processDocumentCollection(List<MultiDoc> mDocs, List<Document> docs, * BlobStore blobStore) throws Exception { log.info ("Processing " + * docs.size() + " documents"); try { indexDocumentCollection(mDocs, docs, * blobStore); } catch (OutOfMemoryError oome) { log.error * ("Sorry, out of memory, results may be incomplete!"); clear(); } } * * /** preprocessed and indexes the docs. */ /* * private void indexDocumentCollection(List<MultiDoc> mDocs, List<Document> * allDocs, BlobStore blobStore) throws Exception { this.clear(); * currentJobStartTimeMillis = System.currentTimeMillis(); * currentJobDocsetSize = allDocs.size(); currentJobDocsProcessed = * currentJobErrors = 0; * * System.gc(); String stat1 = "Memory status before indexing " + * allDocs.size() + " documents: " + Util.getMemoryStats(); log.info * (stat1); docClusters = mDocs; * * if (io.do_NER) NER.printAllTypes(); * * computeClusterStats(mDocs); log.info ("Indexing " + allDocs.size() + * " documents in " + docClusters.size() + " clusters"); int clusterCount = * -1; int docsIndexed = 0, multiDocsIndexed = 0; Posting.nPostingsAllocated * = 0; docClusters = mDocs; * * try { for (MultiDoc md: docClusters) { clusterCount++; log.info * ("-----------------------------"); log.info ("Indexing " + md.docs.size() * + " documents in document cluster #" + clusterCount + ": " + * md.description); * * for (Document d: md.docs) { if (cancel) throw new CancelledException(); * * String contents = ""; if (!io.ignoreDocumentBody) { try { contents = * d.getContents(); } catch (Exception e) { markDataError * ("Exception trying to read " + d + ": " + e); } } * * if (contents.length() > MAX_DOCUMENT_SIZE) { markDataError * ("Document too long, size " + Util.commatize(contents.length()) + * " bytes, dropping it. 
Begins with: " + d + Util.ellipsize(contents, 80)); * contents = ""; } * * String subject = d.getSubjectWithoutTitle(); subject = * EmailUtils.cleanupSubjectLine(subject); * * indexSubdoc(subject, contents, d, blobStore); * * docsIndexed++; currentJobDocsProcessed++; } // end cluster * * log.info ("Finished indexing multi doc " + md); if (md.docs.size() > 0) * log.info ("Current stats:" + computeStats()); * * multiDocsIndexed++; // IndexUtils.dumpDocument(clusterPrefix, * clusterText); // i don't think we need to do this except for debugging * System.out.toString("."); // goes to console, that's ok... * * if (md.docs.size() > 0) { String stat2 = ("Memory status after indexing " * + docsIndexed + " of " + allDocs.size() + " documents in " + * multiDocsIndexed + " (non-zero) multi-docs, total text length " + * stats.processedTextLength + " chars, " + stats.nProcessedNames + * " names. " + Util.getMemoryStats()); log.info (stat2); } } } catch * (OutOfMemoryError oome) { String s = * "REAL WARNING! SEVERE WARNING! Out of memory during indexing. Please retry with more memory!" * + oome; s += "\n"; log.error (s); // option: heroically soldier on and * try to work with partial results } * * // imp: do this at the end to save memory. doesn't save memory during * indexing but saves mem later, when the index is being used. // esp. * important for lens. 
NER.release_classifier(); // release memory for * classifier log.info ("Memory status after releasing classifier: " + * Util.getMemoryStats()); packIndex(); * * return; } */ private Analyzer newAnalyzer() { // we can use LimitTokenCountAnalyzer to limit the #tokens EnglishAnalyzer stemmingAnalyzer = new EnglishAnalyzer(LUCENE_VERSION, MUSE_STOP_WORDS_SET); EnglishNumberAnalyzer snAnalyzer = new EnglishNumberAnalyzer(LUCENE_VERSION, MUSE_STOP_WORDS_SET); // these are the 3 fields for stemming, everything else uses StandardAnalyzer Map<String, Analyzer> map = new LinkedHashMap<String, Analyzer>(); map.put("body", snAnalyzer); map.put("title", snAnalyzer); map.put("body_original", stemmingAnalyzer); KeywordAnalyzer keywordAnalyzer = new KeywordAnalyzer(); // actually these do not need any real analyzer, they are just stored opaquely map.put("docId", keywordAnalyzer); map.put("names_offsets", keywordAnalyzer); //body redacted contains only names and a lot of dots, hence it requires special handling. // if(ModeConfig.isPublicMode()) { // map.put("body", new Analyzer() { // @Override // protected TokenStreamComponents createComponents(final String fieldName, // final Reader reader) { // Version matchVersion = Indexer.LUCENE_VERSION; // final CICTokenizer source = new StandardNumberTokenizer(matchVersion, reader); // TokenStream result = new LowerCaseFilter(matchVersion, source); // return new TokenStreamComponents(source, result); // } // }); // } //do not remove any stop words. StandardAnalyzer standardAnalyzer = new StandardAnalyzer(LUCENE_VERSION, CharArraySet.EMPTY_SET); return new PerFieldAnalyzerWrapper(standardAnalyzer, map); }
From source file:indexing.WTDocIndexer.java
/**
 * Builds the indexing analyzer: the title and analyzed-content fields go
 * through an EnglishAnalyzer whose stop-word set is loaded from "stopfile";
 * every other field falls back to a plain WhitespaceAnalyzer.
 */
@Override
Analyzer constructAnalyzer() {
    Analyzer english = new EnglishAnalyzer(Version.LUCENE_4_9,
            StopFilter.makeStopSet(Version.LUCENE_4_9, buildStopwordList("stopfile")));
    Map<String, Analyzer> perField = new HashMap<String, Analyzer>();
    perField.put(WTDocument.WTDOC_FIELD_TITLE, english);
    perField.put(FIELD_ANALYZED_CONTENT, english);
    return new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_4_9), perField);
}
From source file:it.polito.tellmefirst.lucene.IndexesUtil.java
License:Open Source License
public IndexesUtil() throws TMFIndexesWarmUpException { LOG.debug("[constructor] - BEGIN"); try {/* w w w . ja v a 2 s . c om*/ // build italian searcher Directory contextIndexDirIT = LuceneManager.pickDirectory(new File(TMFVariables.CORPUS_INDEX_IT)); LOG.info("Corpus index used for italian: " + contextIndexDirIT); LuceneManager contextLuceneManagerIT = new LuceneManager(contextIndexDirIT); contextLuceneManagerIT .setLuceneDefaultAnalyzer(new ItalianAnalyzer(Version.LUCENE_36, TMFVariables.STOPWORDS_IT)); ITALIAN_CORPUS_INDEX_SEARCHER = new SimpleSearcher(contextLuceneManagerIT); // build english searcher Directory contextIndexDirEN = LuceneManager.pickDirectory(new File(TMFVariables.CORPUS_INDEX_EN)); LOG.info("Corpus index used for english: " + contextIndexDirEN); LuceneManager contextLuceneManagerEN = new LuceneManager(contextIndexDirEN); contextLuceneManagerEN .setLuceneDefaultAnalyzer(new EnglishAnalyzer(Version.LUCENE_36, TMFVariables.STOPWORDS_EN)); ENGLISH_CORPUS_INDEX_SEARCHER = new SimpleSearcher(contextLuceneManagerIT); } catch (Exception e) { //exceptions are not catched here, because we want to stop TMF server throw new TMFIndexesWarmUpException("Problem with setting up TMF indexes: ", e); } LOG.debug("[constructor] - END"); }
From source file:it.polito.tellmefirst.web.rest.TMFServer.java
License:Open Source License
/** * TMF starting point. From rest directory, launch this command: * mvn exec:java -Dexec.mainClass="it.polito.temefirst.web.rest.TMFServer" -Dexec.args="<path_to_TMF_installation>/conf/server.properties" * or use the run.sh file in bin directory */// w w w. j a va 2 s . c o m public static void main(String[] args) throws TMFConfigurationException, TMFIndexesWarmUpException, URISyntaxException, InterruptedException, IOException { LOG.debug("[main] - BEGIN"); URI serverURI = new URI("http://localhost:2222/rest/"); String configFileName = args[0]; new TMFVariables(configFileName); // XXX I put the code of IndexUtil.init() here, because, for now, I need a reference of SimpleSearchers for the Enhancer // build italian searcher Directory contextIndexDirIT = LuceneManager.pickDirectory(new File(TMFVariables.CORPUS_INDEX_IT)); LOG.info("Corpus index used for italian: " + contextIndexDirIT); LuceneManager contextLuceneManagerIT = new LuceneManager(contextIndexDirIT); contextLuceneManagerIT .setLuceneDefaultAnalyzer(new ItalianAnalyzer(Version.LUCENE_36, TMFVariables.STOPWORDS_IT)); ITALIAN_CORPUS_INDEX_SEARCHER = new SimpleSearcher(contextLuceneManagerIT); // build english searcher Directory contextIndexDirEN = LuceneManager.pickDirectory(new File(TMFVariables.CORPUS_INDEX_EN)); LOG.info("Corpus index used for english: " + contextIndexDirEN); LuceneManager contextLuceneManagerEN = new LuceneManager(contextIndexDirEN); contextLuceneManagerEN .setLuceneDefaultAnalyzer(new EnglishAnalyzer(Version.LUCENE_36, TMFVariables.STOPWORDS_EN)); ENGLISH_CORPUS_INDEX_SEARCHER = new SimpleSearcher(contextLuceneManagerEN); // build kb italian searcher String kbDirIT = TMFVariables.KB_IT; String residualKbDirIT = TMFVariables.RESIDUAL_KB_IT; ITALIAN_KB_INDEX_SEARCHER = new KBIndexSearcher(kbDirIT, residualKbDirIT); // build kb english searcher String kbDirEN = TMFVariables.KB_EN; String residualKbDirEN = TMFVariables.RESIDUAL_KB_EN; ENGLISH_KB_INDEX_SEARCHER = new KBIndexSearcher(kbDirEN, 
residualKbDirEN); enhancer = new Enhancer(ITALIAN_CORPUS_INDEX_SEARCHER, ENGLISH_CORPUS_INDEX_SEARCHER, ITALIAN_KB_INDEX_SEARCHER, ENGLISH_KB_INDEX_SEARCHER); italianClassifier = new Classifier("it", ITALIAN_CORPUS_INDEX_SEARCHER); englishClassifier = new Classifier("en", ENGLISH_CORPUS_INDEX_SEARCHER); //The following is adapted from DBpedia Spotlight (https://github.com/dbpedia-spotlight/dbpedia-spotlight) final Map<String, String> initParams = new HashMap<String, String>(); initParams.put("com.sun.jersey.config.property.resourceConfigClass", "com.sun.jersey.api.core." + "PackagesResourceConfig"); initParams.put("com.sun.jersey.config.property.packages", "it.polito.tellmefirst.web.rest.services"); initParams.put("com.sun.jersey.config.property.WadlGeneratorConfig", "it.polito.tellmefirst.web.rest.wadl." + "ExternalUriWadlGeneratorConfig"); SelectorThread threadSelector = GrizzlyWebContainerFactory.create(serverURI, initParams); threadSelector.start(); System.err.println("Server started in " + System.getProperty("user.dir") + " listening on " + serverURI); Thread warmUp = new Thread() { public void run() { } }; warmUp.start(); while (running) { Thread.sleep(100); } threadSelector.stopEndpoint(); System.exit(0); LOG.debug("[main] - END"); }
From source file:jetbrains.exodus.lucene.ExodusLuceneTestsBase.java
License:Apache License
protected void removeStopWord(final String stopWord) { final HashSet<Object> stopSet = new HashSet<>(); for (Object word : ((StopwordAnalyzerBase) analyzer).getStopwordSet()) { if (!stopWord.equals(new String((char[]) word))) { stopSet.add(word);/*from ww w . ja v a 2 s . c o m*/ } } analyzer = new EnglishAnalyzer(LUCENE_VERSION, stopSet); }
From source file:nicta.com.au.failureanalysis.optimalquery.OptPatentQuery.java
private void analyze() throws IOException { String title = ""; String ipc = ""; String abstrac = ""; String description = ""; String descriptionP5 = ""; String claims = ""; String claims1 = ""; //******************************************************************** // leveraging Title //******************************************************************** for (InventionTitle inventionTitle : pt.getTechnicalData().getInventionTitle()) { if (inventionTitle.getLang().toLowerCase().equals("en")) { title = inventionTitle.getContent(); }/* w ww .ja v a2s .c o m*/ } //******************************************************************** // leveraging IPC Codes //******************************************************************** Set<String> codes = new HashSet<>(); for (ClassificationIpcr ipcCode : pt.getTechnicalData().getClassificationIpcr()) { StringTokenizer st = new StringTokenizer(ipcCode.getContent()); String p1 = st.nextToken(); String p2 = st.nextToken(); codes.add(p1); fullClassCodes.add(p1 + p2); } for (String code : codes) { if (!ipc.contains(code)) { ipc += code + " "; } } //******************************************************************** // leveraging Abstract //******************************************************************** if (pt.getAbstrac().getLang() != null) { if (pt.getAbstrac().getLang().toLowerCase().equals("en")) { abstrac = pt.getAbstrac().getContent(); } } //******************************************************************** // leveraging Description //******************************************************************** if (pt.getDescription() != null) { if (pt.getDescription().getLang().toLowerCase().equals("en")) { for (P p : pt.getDescription().getP()) { if (Integer.parseInt(p.getNum()) == 1 || Integer.parseInt(p.getNum()) == 2 || Integer.parseInt(p.getNum()) == 3 || Integer.parseInt(p.getNum()) == 4 || Integer.parseInt(p.getNum()) == 5) { // Leveraging first 5 paragraphes descriptionP5 += p.getContent() + " "; } description += 
p.getContent() + " "; } } } //******************************************************************** // leveraging Claims //******************************************************************** for (Claims cs : pt.getClaims()) { if (cs.getLang().toLowerCase().equals("en")) { for (Claim claim : cs.getClaim()) { if (Integer.parseInt(claim.getNum()) == 1) {// Leveraging Claims 1 claims1 += claim.getClaimText() + " "; } claims += claim.getClaimText() + " "; } } } //******************************************************************** this.queries[0] = ipc; this.queries[1] = title; this.queries[2] = abstrac; this.queries[3] = description; this.queries[4] = descriptionP5; this.queries[5] = claims; this.queries[6] = claims1; Map<String, Analyzer> analyzerPerField = new HashMap<>(); if (stopWords) { /*analyzerPerField.put(PatentDocument.Title, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.TITLE_ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Abstract, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ABSTRACT_ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Description, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.DESCRIPTION_ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Claims, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.CLAIMS_ENGLISH_STOP_WORDS_SET));*/ analyzerPerField.put(PatentDocument.Title, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.UNIFIED_PATENT__ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Abstract, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.UNIFIED_PATENT__ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Description, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.UNIFIED_PATENT__ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Claims, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.UNIFIED_PATENT__ENGLISH_STOP_WORDS_SET)); } else { analyzerPerField.put(PatentDocument.Title, new 
EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Abstract, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Description, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Claims, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET)); } analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_48), analyzerPerField); boolean oneNumber = false; int j = -1; for (int i = 1; i < fields.length; i++) { float v = boosts.get(fields[i]); if (oneNumber == false && v > 0) { oneNumber = true; j = i; } else if (oneNumber && v > 0) { oneNumber = false; j = -1; break; } } if (oneNumber) { String qText = queries[j]; String q = transformation(analyzer.tokenStream(fields[j], qText), titleTreshold, PatentDocument.Title); queries[1] = q; System.err.println(queries[1]); boosts.put(PatentDocument.Title, (float) 1.0); q = transformation(analyzer.tokenStream(fields[j], qText), abstractTreshold, PatentDocument.Abstract); queries[2] = q; // System.err.println(queries[2]); boosts.put(PatentDocument.Abstract, (float) 1.0); q = transformation(analyzer.tokenStream(fields[j], qText), descriptionTreshold, PatentDocument.Description); // System.err.println(q); queries[3] = q; boosts.put(PatentDocument.Description, (float) 1.0); queries[4] = ""; q = transformation(analyzer.tokenStream(fields[j], qText), claimsTreshold, PatentDocument.Claims); // System.err.println(q); queries[5] = q; boosts.put(PatentDocument.Claims, (float) 1.0); queries[6] = ""; } else { String[] qText = queries; String q = transformation(analyzer.tokenStream(PatentDocument.Title, qText[1]), titleTreshold, PatentDocument.Title); queries[1] = q; System.err.println(queries[1]); q = transformation(analyzer.tokenStream(PatentDocument.Abstract, qText[2]), abstractTreshold, 
PatentDocument.Abstract); queries[2] = q; System.err.println(q); q = transformation(analyzer.tokenStream(PatentDocument.Description, qText[3]), descriptionTreshold, PatentDocument.Description); queries[3] = q; System.err.println(q); q = transformation(analyzer.tokenStream(PatentDocument.Description, qText[4]), descriptionTreshold, null); queries[4] = q; System.err.println(q); q = transformation(analyzer.tokenStream(PatentDocument.Claims, qText[5]), claimsTreshold, PatentDocument.Claims); queries[5] = q; System.err.println(q); q = transformation(analyzer.tokenStream(PatentDocument.Claims, qText[6]), claimsTreshold, null); queries[6] = q; System.err.println(q); } }
From source file:nicta.com.au.failureanalysis.query.QueryGneration.java
public Map<String, Integer> getSectionTerms(String section) throws IOException { String title = ""; String ipc = ""; String abstrac = ""; String description = ""; String descriptionP5 = ""; String claims = ""; String claims1 = ""; //******************************************************************** // leveraging Title //******************************************************************** for (InventionTitle inventionTitle : pt.getTechnicalData().getInventionTitle()) { if (inventionTitle.getLang().toLowerCase().equals("en")) { title = inventionTitle.getContent(); }/*from w w w. j av a 2 s . co m*/ } //******************************************************************** // leveraging IPC Codes //******************************************************************** Set<String> codes = new HashSet<>(); for (ClassificationIpcr ipcCode : pt.getTechnicalData().getClassificationIpcr()) { StringTokenizer st = new StringTokenizer(ipcCode.getContent()); String p1 = st.nextToken(); String p2 = st.nextToken(); codes.add(p1); fullClassCodes.add(p1 + p2); } for (String code : codes) { if (!ipc.contains(code)) { ipc += code + " "; } } //******************************************************************** // leveraging Abstract //******************************************************************** if (pt.getAbstrac().getLang() != null) { if (pt.getAbstrac().getLang().toLowerCase().equals("en")) { abstrac = pt.getAbstrac().getContent(); } } //******************************************************************** // leveraging Description //******************************************************************** if (pt.getDescription() != null) { if (pt.getDescription().getLang().toLowerCase().equals("en")) { for (P p : pt.getDescription().getP()) { if (Integer.parseInt(p.getNum()) == 1 || Integer.parseInt(p.getNum()) == 2 || Integer.parseInt(p.getNum()) == 3 || Integer.parseInt(p.getNum()) == 4 || Integer.parseInt(p.getNum()) == 5) { // Leveraging first 5 paragraphes descriptionP5 += 
p.getContent() + " "; } description += p.getContent() + " "; } } } //******************************************************************** // leveraging Claims //******************************************************************** for (Claims cs : pt.getClaims()) { if (cs.getLang().toLowerCase().equals("en")) { for (Claim claim : cs.getClaim()) { if (Integer.parseInt(claim.getNum()) == 1) {// Leveraging Claims 1 claims1 += claim.getClaimText() + " "; } claims += claim.getClaimText() + " "; } } } //******************************************************************** this.queries[0] = ipc; this.queries[1] = title; this.queries[2] = abstrac; this.queries[3] = description; this.queries[4] = descriptionP5; this.queries[5] = claims; this.queries[6] = claims1; Map<String, Analyzer> analyzerPerField = new HashMap<>(); if (stopWords) { /*analyzerPerField.put(PatentDocument.Title, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.TITLE_ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Abstract, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ABSTRACT_ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Description, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.DESCRIPTION_ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Claims, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.CLAIMS_ENGLISH_STOP_WORDS_SET));*/ analyzerPerField.put(PatentDocument.Title, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.UNIFIED_PATENT__ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Abstract, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.UNIFIED_PATENT__ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Description, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.UNIFIED_PATENT__ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Claims, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.UNIFIED_PATENT__ENGLISH_STOP_WORDS_SET)); } else { 
analyzerPerField.put(PatentDocument.Title, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Abstract, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Description, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Claims, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET)); } analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_48), analyzerPerField); boolean oneNumber = false; int j = -1; for (int i = 1; i < fields.length; i++) { float v = boosts.get(fields[i]); if (oneNumber == false && v > 0) { oneNumber = true; j = i; } else if (oneNumber && v > 0) { oneNumber = false; j = -1; break; } } /*if (oneNumber) { String qText = queries[j]; String q = transformation(analyzer.tokenStream(fields[j], qText), titleTreshold, PatentDocument.Title); // System.err.println(q); queries[1] = q; boosts.put(PatentDocument.Title, (float) 1.0); q = transformation(analyzer.tokenStream(fields[j], qText), abstractTreshold, PatentDocument.Abstract); // System.err.println(q); queries[2] = q; boosts.put(PatentDocument.Abstract, (float) 1.0); q = transformation(analyzer.tokenStream(fields[j], qText), descriptionTreshold, PatentDocument.Description); // System.err.println(q); queries[3] = q; boosts.put(PatentDocument.Description, (float) 1.0); queries[4] = ""; q = transformation(analyzer.tokenStream(fields[j], qText), claimsTreshold, PatentDocument.Claims); // System.err.println(q); queries[5] = q; boosts.put(PatentDocument.Claims, (float) 1.0); queries[6] = ""; } else {*/ String[] qText = queries; String q = transformation(analyzer.tokenStream(PatentDocument.Title, qText[1]), titleTreshold, PatentDocument.Title); Map<String, Integer> t = getTerms(analyzer.tokenStream(PatentDocument.Title, qText[1]), titleTreshold, PatentDocument.Title); 
// System.err.println(q); queries[1] = q; q = transformation(analyzer.tokenStream(PatentDocument.Abstract, qText[2]), abstractTreshold, PatentDocument.Abstract); Map<String, Integer> a = getTerms(analyzer.tokenStream(PatentDocument.Abstract, qText[2]), abstractTreshold, PatentDocument.Abstract); // System.err.println(q); queries[2] = q; q = transformation(analyzer.tokenStream(PatentDocument.Description, qText[3]), descriptionTreshold, PatentDocument.Description); Map<String, Integer> d = getTerms(analyzer.tokenStream(PatentDocument.Description, qText[3]), descriptionTreshold, PatentDocument.Description); // System.err.println(q); queries[3] = q; q = transformation(analyzer.tokenStream(PatentDocument.Description, qText[4]), descriptionTreshold, null); queries[4] = q; q = transformation(analyzer.tokenStream(PatentDocument.Claims, qText[5]), claimsTreshold, PatentDocument.Claims); Map<String, Integer> c = getTerms(analyzer.tokenStream(PatentDocument.Claims, qText[5]), claimsTreshold, PatentDocument.Claims); // System.err.println(q); queries[5] = q; q = transformation(analyzer.tokenStream(PatentDocument.Claims, qText[6]), claimsTreshold, null); queries[6] = q; if (section.equals("title")) { // System.err.println(q); return t; } else { if (section.equals("abstract")) { return a; } else { if (section.equals("description")) { return d; } else { return c; } } } // return t; // } }
From source file:nicta.com.au.patent.pac.analysis.FieldsCosineSimilarities.java
public final Map<String, Double>[] parse(PatentDocument pt) throws IOException, Exception { Map<String, Double>[] out = new Map[5]; String[] ptFields = new String[5]; String title = ""; String ipc = ""; String abstrac = ""; String description = ""; String claims = ""; for (InventionTitle inventionTitle : pt.getTechnicalData().getInventionTitle()) { if (inventionTitle.getLang().toLowerCase().equals("en")) { title = inventionTitle.getContent(); }//from ww w . jav a 2s.c o m } Map<String, Double> m1 = new HashMap<>(); for (ClassificationIpcr ipcCode : pt.getTechnicalData().getClassificationIpcr()) { StringTokenizer st = new StringTokenizer(ipcCode.getContent()); m1.put(st.nextToken(), 1.0); } if (pt.getAbstrac().getLang() != null && pt.getAbstrac().getLang().toLowerCase().equals("en")) { abstrac = pt.getAbstrac().getContent(); } if (pt.getDescription() != null && pt.getDescription().getLang().toLowerCase().equals("en")) { for (P p : pt.getDescription().getP()) { description += p.getContent() + " "; } } for (Claims cs : pt.getClaims()) { if (cs.getLang().toLowerCase().equals("en")) { for (Claim claim : cs.getClaim()) { claims += claim.getClaimText() + " "; } } } ptFields[0] = title; ptFields[1] = ipc; ptFields[2] = abstrac; ptFields[3] = description; ptFields[4] = claims; Map<String, Analyzer> analyzerPerField = new HashMap<>(); if (specificStopWords == true) { analyzerPerField.put(PatentDocument.Title, new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.TITLE_ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Abstract, new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ABSTRACT_ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Description, new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.DESCRIPTION_ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Claims, new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.CLAIMS_ENGLISH_STOP_WORDS_SET)); } else { analyzerPerField.put(PatentDocument.Title, new 
EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Abstract, new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Description, new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ENGLISH_STOP_WORDS_SET)); analyzerPerField.put(PatentDocument.Claims, new EnglishAnalyzer(Version.LUCENE_44, PatentsStopWords.ENGLISH_STOP_WORDS_SET)); } PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_44), analyzerPerField); Map<String, Double> m0 = getVector(analyzer.tokenStream(PatentDocument.Title, ptFields[0]), PatentDocument.Title); Map<String, Double> m2 = getVector(analyzer.tokenStream(PatentDocument.Abstract, ptFields[2]), PatentDocument.Abstract); Map<String, Double> m3 = getVector(analyzer.tokenStream(PatentDocument.Description, ptFields[3]), PatentDocument.Description); Map<String, Double> m4 = getVector(analyzer.tokenStream(PatentDocument.Claims, ptFields[4]), PatentDocument.Claims); out[0] = m0; out[1] = m1; out[2] = m2; out[3] = m3; out[4] = m4; return out; }