Example usage for org.apache.lucene.analysis.en EnglishAnalyzer EnglishAnalyzer

List of usage examples for org.apache.lucene.analysis.en EnglishAnalyzer EnglishAnalyzer

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.en EnglishAnalyzer EnglishAnalyzer.

Prototype

public EnglishAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) 

Source Link

Document

Builds an analyzer with the given stop words and a set of terms to exclude from stemming.

Usage

From source file:back.Searcher.java

License:Apache License

/**
 * Simple command-line based search demo: runs {@code query} against the index
 * variant matching the requested preprocessing (stop-word removal and/or
 * stemming), then pages through the hits via {@code doPagingSearch}.
 *
 * @param query    query string to search for; when null, queries are read from stdin
 * @param stopword whether the target index was built with stop-word removal
 * @param stemming whether the target index was built with stemming
 * @param consulta query identifier forwarded to {@code doPagingSearch}
 * @throws Exception on any I/O or query-parsing failure
 */
public static void search(String query, boolean stopword, boolean stemming, int consulta) throws Exception {

    // The query-time analyzer must match the one used at index time.
    String index;
    Analyzer analyzer;
    if (!stopword && !stemming) {
        index = ".\\indexed";
        // Empty stop-word set: keep every token, no stemming.
        analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT,
                new CharArraySet(Version.LUCENE_CURRENT, 0, false));
        System.out.println("Nenhum Marcado");
    } else if (stopword && !stemming) {
        index = ".\\indexedNoStpWrd";
        analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        System.out.println("Primeiro Marcado");
    } else if (!stopword && stemming) {
        index = ".\\indexedStemming";
        // EnglishAnalyzer stems; the empty stop-word set disables stop-word removal.
        analyzer = new EnglishAnalyzer(Version.LUCENE_CURRENT,
                new CharArraySet(Version.LUCENE_CURRENT, 0, false));
        System.out.println("Segundo Marcado");
    } else { // stopword && stemming — the four branches are exhaustive
        index = ".\\indexedTreated";
        analyzer = new EnglishAnalyzer(Version.LUCENE_CURRENT);
        System.out.println("Dois Marcados");
    }
    String field = "contents";
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String queryString = query;
    int hitsPerPage = 200;

    // Load the relevance matrix, closing the CSV reader when done
    // (it was leaked in the original).
    CSVReader csvReader = new CSVReader(new FileReader(".\\matriz.csv"));
    List<String[]> myEntries;
    try {
        myEntries = csvReader.readAll();
    } finally {
        csvReader.close();
    }

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
    try {
        IndexSearcher searcher = new IndexSearcher(reader);

        // 'in' wraps System.in when queries == null (always, here), so it is
        // intentionally never closed — closing it would close System.in.
        BufferedReader in;
        if (queries != null) {
            in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
        } else {
            in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
        }
        QueryParser parser = new QueryParser(Version.LUCENE_40, field, analyzer);
        while (true) {
            if (queries == null && queryString == null) { // prompt the user
                System.out.println("Enter query: ");
            }

            String line = queryString != null ? queryString : in.readLine();

            // BUG FIX: the original also tested line.length() == -1, which can
            // never be true (String.length() is non-negative); a null line alone
            // signals end of input.
            if (line == null) {
                break;
            }

            line = line.trim();
            if (line.length() == 0) {
                break;
            }

            Query query1 = parser.parse(line);
            System.out.println("Searching for: " + query1.toString(field));

            if (repeat > 0) { // repeat & time as benchmark
                Date start = new Date();
                for (int i = 0; i < repeat; i++) {
                    searcher.search(query1, null, 100);
                }
                Date end = new Date();
                System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");
            }

            doPagingSearch(in, searcher, query1, hitsPerPage, raw, queries == null && queryString == null,
                    myEntries, consulta);

            if (queryString != null) {
                break;
            }
        }
    } finally {
        reader.close(); // release the index even if parsing/searching throws
    }
}

From source file:com.tilab.ca.sse.core.lucene.IndexesUtil.java

License:Open Source License

/**
 * Initialize the classifiers. This static method initializes the italian
 * and the english classifiers under the hood. You must call this function
 * after you have constructed an instance of the SSEVariables class as
 * described in SSEVariables docs./* ww  w  .j a v a  2  s  .c om*/
 *
 * If you don't call this method, when you use the classifier you will get a
 * NullPointerException in Classifier().
 *
 * @since 2.0.0.0.
 */
public static void init() {
    LOG.debug("[initializator] - BEGIN");

    sseConfigFromCache = ConfigCache.getOrCreate(SSEConfig.class);

    ITALIAN_CORPUS_INDEX_SEARCHER = indexLoading(() -> {
        // build italian searcher
        Directory contextIndexDirIT = LuceneManager.pickDirectory(new File(sseConfigFromCache.corpusIndexIT()));
        LOG.info("Corpus index used for italian: " + contextIndexDirIT);
        LuceneManager contextLuceneManagerIT = new LuceneManager(contextIndexDirIT);
        contextLuceneManagerIT.setLuceneDefaultAnalyzer(
                new ItalianAnalyzer(Version.LUCENE_36, getStopWords(sseConfigFromCache.stopWordsIT())));
        return new SimpleSearcher(contextLuceneManagerIT);
    }).orElse(null); //FIXME not a good use of Optional -> use a default SimpleSearcher

    ENGLISH_CORPUS_INDEX_SEARCHER = indexLoading(() -> {
        // build english searcher
        Directory contextIndexDirEN = LuceneManager.pickDirectory(new File(sseConfigFromCache.corpusIndexEN()));
        LOG.info("Corpus index used for english: " + contextIndexDirEN);
        LuceneManager contextLuceneManagerEN = new LuceneManager(contextIndexDirEN);
        contextLuceneManagerEN.setLuceneDefaultAnalyzer(
                new EnglishAnalyzer(Version.LUCENE_36, getStopWords(sseConfigFromCache.stopWordsEN())));
        return new SimpleSearcher(contextLuceneManagerEN);
    }).orElse(null); //FIXME not a good use of Optional -> use a default SimpleSearcher

    if (ITALIAN_CORPUS_INDEX_SEARCHER == null && ENGLISH_CORPUS_INDEX_SEARCHER == null) {
        throw new RuntimeException("Indexes not available");
    }

    LOG.debug("[initializator] - END");
}

From source file:edu.stanford.muse.index.Indexer.java

License:Apache License

/**
 * main entry point for indexing. note: recomputeCards has to be called
 * separately//  w w w.  ja  va2s  . c o  m
 */
/*
 * void processDocumentCollection(List<MultiDoc> mDocs, List<Document> docs,
 * BlobStore blobStore) throws Exception { log.info ("Processing " +
 * docs.size() + " documents"); try { indexDocumentCollection(mDocs, docs,
 * blobStore); } catch (OutOfMemoryError oome) { log.error
 * ("Sorry, out of memory, results may be incomplete!"); clear(); } }
 * 
 * /** preprocessed and indexes the docs.
 */
/*
 * private void indexDocumentCollection(List<MultiDoc> mDocs, List<Document>
 * allDocs, BlobStore blobStore) throws Exception { this.clear();
 * currentJobStartTimeMillis = System.currentTimeMillis();
 * currentJobDocsetSize = allDocs.size(); currentJobDocsProcessed =
 * currentJobErrors = 0;
 * 
 * System.gc(); String stat1 = "Memory status before indexing " +
 * allDocs.size() + " documents: " + Util.getMemoryStats(); log.info
 * (stat1); docClusters = mDocs;
 * 
 * if (io.do_NER) NER.printAllTypes();
 * 
 * computeClusterStats(mDocs); log.info ("Indexing " + allDocs.size() +
 * " documents in " + docClusters.size() + " clusters"); int clusterCount =
 * -1; int docsIndexed = 0, multiDocsIndexed = 0; Posting.nPostingsAllocated
 * = 0; docClusters = mDocs;
 * 
 * try { for (MultiDoc md: docClusters) { clusterCount++; log.info
 * ("-----------------------------"); log.info ("Indexing " + md.docs.size()
 * + " documents in document cluster #" + clusterCount + ": " +
 * md.description);
 * 
 * for (Document d: md.docs) { if (cancel) throw new CancelledException();
 * 
 * String contents = ""; if (!io.ignoreDocumentBody) { try { contents =
 * d.getContents(); } catch (Exception e) { markDataError
 * ("Exception trying to read " + d + ": " + e); } }
 * 
 * if (contents.length() > MAX_DOCUMENT_SIZE) { markDataError
 * ("Document too long, size " + Util.commatize(contents.length()) +
 * " bytes, dropping it. Begins with: " + d + Util.ellipsize(contents, 80));
 * contents = ""; }
 * 
 * String subject = d.getSubjectWithoutTitle(); subject =
 * EmailUtils.cleanupSubjectLine(subject);
 * 
 * indexSubdoc(subject, contents, d, blobStore);
 * 
 * docsIndexed++; currentJobDocsProcessed++; } // end cluster
 * 
 * log.info ("Finished indexing multi doc " + md); if (md.docs.size() > 0)
 * log.info ("Current stats:" + computeStats());
 * 
 * multiDocsIndexed++; // IndexUtils.dumpDocument(clusterPrefix,
 * clusterText); // i don't think we need to do this except for debugging
 * System.out.toString("."); // goes to console, that's ok...
 * 
 * if (md.docs.size() > 0) { String stat2 = ("Memory status after indexing "
 * + docsIndexed + " of " + allDocs.size() + " documents in " +
 * multiDocsIndexed + " (non-zero) multi-docs, total text length " +
 * stats.processedTextLength + " chars, " + stats.nProcessedNames +
 * " names. " + Util.getMemoryStats()); log.info (stat2); } } } catch
 * (OutOfMemoryError oome) { String s =
 * "REAL WARNING! SEVERE WARNING! Out of memory during indexing. Please retry with more memory!"
 * + oome; s += "\n"; log.error (s); // option: heroically soldier on and
 * try to work with partial results }
 * 
 * // imp: do this at the end to save memory. doesn't save memory during
 * indexing but saves mem later, when the index is being used. // esp.
 * important for lens. NER.release_classifier(); // release memory for
 * classifier log.info ("Memory status after releasing classifier: " +
 * Util.getMemoryStats()); packIndex();
 * 
 * return; }
 */

/**
 * Builds the per-field analyzer used for indexing: stemming analyzers for the
 * three stemmed fields, keyword analyzers for opaque identifier fields, and a
 * stop-word-free StandardAnalyzer for everything else.
 */
private Analyzer newAnalyzer() {
    // LimitTokenCountAnalyzer could be layered on top to cap the #tokens.

    EnglishAnalyzer stemmer = new EnglishAnalyzer(LUCENE_VERSION, MUSE_STOP_WORDS_SET);
    EnglishNumberAnalyzer numberAwareStemmer = new EnglishNumberAnalyzer(LUCENE_VERSION, MUSE_STOP_WORDS_SET);
    KeywordAnalyzer opaque = new KeywordAnalyzer();

    Map<String, Analyzer> perField = new LinkedHashMap<String, Analyzer>();
    // The only three stemmed fields; all others fall through to the default.
    perField.put("body", numberAwareStemmer);
    perField.put("title", numberAwareStemmer);
    perField.put("body_original", stemmer);
    // Identifier-ish fields are stored opaquely and need no real analysis.
    perField.put("docId", opaque);
    perField.put("names_offsets", opaque);
    // NOTE(review): an earlier, commented-out variant special-cased "body" in
    // public mode (redacted bodies are mostly names plus dots) — confirm whether
    // that handling is still wanted before relying on this analyzer there.

    // Default analyzer: standard tokenization, but do not remove any stop words.
    StandardAnalyzer fallback = new StandardAnalyzer(LUCENE_VERSION, CharArraySet.EMPTY_SET);

    return new PerFieldAnalyzerWrapper(fallback, perField);
}

From source file:indexing.WTDocIndexer.java

@Override
Analyzer constructAnalyzer() {
    // English analyzer (stemming + custom stop words loaded from "stopfile")
    // for the title and analyzed-content fields.
    Analyzer english = new EnglishAnalyzer(Version.LUCENE_4_9,
            StopFilter.makeStopSet(Version.LUCENE_4_9, buildStopwordList("stopfile")));

    Map<String, Analyzer> perField = new HashMap<String, Analyzer>();
    perField.put(WTDocument.WTDOC_FIELD_TITLE, english);
    perField.put(FIELD_ANALYZED_CONTENT, english);

    // Every other field gets plain whitespace tokenization.
    Analyzer fallback = new WhitespaceAnalyzer(Version.LUCENE_4_9);
    return new PerFieldAnalyzerWrapper(fallback, perField);
}

From source file:it.polito.tellmefirst.lucene.IndexesUtil.java

License:Open Source License

/**
 * Builds the italian and english corpus searchers used by TMF.
 *
 * @throws TMFIndexesWarmUpException if either index cannot be set up; the TMF
 *         server is expected to stop in that case.
 */
public IndexesUtil() throws TMFIndexesWarmUpException {
    LOG.debug("[constructor] - BEGIN");
    try {
        // build italian searcher
        Directory contextIndexDirIT = LuceneManager.pickDirectory(new File(TMFVariables.CORPUS_INDEX_IT));
        LOG.info("Corpus index used for italian: " + contextIndexDirIT);
        LuceneManager contextLuceneManagerIT = new LuceneManager(contextIndexDirIT);
        contextLuceneManagerIT
                .setLuceneDefaultAnalyzer(new ItalianAnalyzer(Version.LUCENE_36, TMFVariables.STOPWORDS_IT));
        ITALIAN_CORPUS_INDEX_SEARCHER = new SimpleSearcher(contextLuceneManagerIT);

        // build english searcher
        Directory contextIndexDirEN = LuceneManager.pickDirectory(new File(TMFVariables.CORPUS_INDEX_EN));
        LOG.info("Corpus index used for english: " + contextIndexDirEN);
        LuceneManager contextLuceneManagerEN = new LuceneManager(contextIndexDirEN);
        contextLuceneManagerEN
                .setLuceneDefaultAnalyzer(new EnglishAnalyzer(Version.LUCENE_36, TMFVariables.STOPWORDS_EN));
        // BUG FIX: this searcher was built from contextLuceneManagerIT, so the
        // "english" searcher silently queried the italian corpus index.
        ENGLISH_CORPUS_INDEX_SEARCHER = new SimpleSearcher(contextLuceneManagerEN);
    } catch (Exception e) {
        // Deliberately rethrown as a warm-up exception: any setup failure must
        // stop the TMF server rather than be handled here.
        throw new TMFIndexesWarmUpException("Problem with setting up TMF indexes: ", e);
    }
    LOG.debug("[constructor] - END");
}

From source file:it.polito.tellmefirst.web.rest.TMFServer.java

License:Open Source License

/**
 * TMF starting point. From rest directory, launch this command:
 * mvn exec:java -Dexec.mainClass="it.polito.temefirst.web.rest.TMFServer" -Dexec.args="<path_to_TMF_installation>/conf/server.properties"
 * or use the run.sh file in bin directory
 */
public static void main(String[] args) throws TMFConfigurationException, TMFIndexesWarmUpException,
        URISyntaxException, InterruptedException, IOException {
    LOG.debug("[main] - BEGIN");
    // Robustness: fail fast with a usage hint instead of an
    // ArrayIndexOutOfBoundsException when the config path is missing.
    if (args.length < 1) {
        System.err.println("Usage: TMFServer <path_to_TMF_installation>/conf/server.properties");
        System.exit(1);
    }
    URI serverURI = new URI("http://localhost:2222/rest/");
    String configFileName = args[0];
    new TMFVariables(configFileName);

    // XXX I put the code of IndexUtil.init() here, because, for now, I need a reference of SimpleSearchers for the Enhancer

    // build italian searcher
    Directory contextIndexDirIT = LuceneManager.pickDirectory(new File(TMFVariables.CORPUS_INDEX_IT));
    LOG.info("Corpus index used for italian: " + contextIndexDirIT);
    LuceneManager contextLuceneManagerIT = new LuceneManager(contextIndexDirIT);
    contextLuceneManagerIT
            .setLuceneDefaultAnalyzer(new ItalianAnalyzer(Version.LUCENE_36, TMFVariables.STOPWORDS_IT));
    ITALIAN_CORPUS_INDEX_SEARCHER = new SimpleSearcher(contextLuceneManagerIT);

    // build english searcher
    Directory contextIndexDirEN = LuceneManager.pickDirectory(new File(TMFVariables.CORPUS_INDEX_EN));
    LOG.info("Corpus index used for english: " + contextIndexDirEN);
    LuceneManager contextLuceneManagerEN = new LuceneManager(contextIndexDirEN);
    contextLuceneManagerEN
            .setLuceneDefaultAnalyzer(new EnglishAnalyzer(Version.LUCENE_36, TMFVariables.STOPWORDS_EN));
    ENGLISH_CORPUS_INDEX_SEARCHER = new SimpleSearcher(contextLuceneManagerEN);

    // build kb italian searcher
    String kbDirIT = TMFVariables.KB_IT;
    String residualKbDirIT = TMFVariables.RESIDUAL_KB_IT;
    ITALIAN_KB_INDEX_SEARCHER = new KBIndexSearcher(kbDirIT, residualKbDirIT);

    // build kb english searcher
    String kbDirEN = TMFVariables.KB_EN;
    String residualKbDirEN = TMFVariables.RESIDUAL_KB_EN;
    ENGLISH_KB_INDEX_SEARCHER = new KBIndexSearcher(kbDirEN, residualKbDirEN);

    enhancer = new Enhancer(ITALIAN_CORPUS_INDEX_SEARCHER, ENGLISH_CORPUS_INDEX_SEARCHER,
            ITALIAN_KB_INDEX_SEARCHER, ENGLISH_KB_INDEX_SEARCHER);

    italianClassifier = new Classifier("it", ITALIAN_CORPUS_INDEX_SEARCHER);
    englishClassifier = new Classifier("en", ENGLISH_CORPUS_INDEX_SEARCHER);

    //The following is adapted from DBpedia Spotlight (https://github.com/dbpedia-spotlight/dbpedia-spotlight)
    final Map<String, String> initParams = new HashMap<String, String>();
    initParams.put("com.sun.jersey.config.property.resourceConfigClass",
            "com.sun.jersey.api.core." + "PackagesResourceConfig");
    initParams.put("com.sun.jersey.config.property.packages", "it.polito.tellmefirst.web.rest.services");
    initParams.put("com.sun.jersey.config.property.WadlGeneratorConfig",
            "it.polito.tellmefirst.web.rest.wadl." + "ExternalUriWadlGeneratorConfig");
    SelectorThread threadSelector = GrizzlyWebContainerFactory.create(serverURI, initParams);
    threadSelector.start();
    System.err.println("Server started in " + System.getProperty("user.dir") + " listening on " + serverURI);
    // NOTE(review): this warm-up thread has an empty body — presumably a
    // placeholder for real warm-up work; confirm before removing it.
    Thread warmUp = new Thread() {
        public void run() {
        }
    };
    warmUp.start();
    while (running) {
        Thread.sleep(100);
    }
    threadSelector.stopEndpoint();
    // BUG FIX: the END log was placed after System.exit(0) and never executed.
    LOG.debug("[main] - END");
    System.exit(0);
}

From source file:jetbrains.exodus.lucene.ExodusLuceneTestsBase.java

License:Apache License

/**
 * Rebuilds {@code analyzer} as an EnglishAnalyzer whose stop-word set is the
 * current analyzer's set minus {@code stopWord}.
 *
 * @param stopWord the stop word to drop from the analyzer's stop-word set
 */
protected void removeStopWord(final String stopWord) {
    final HashSet<Object> remaining = new HashSet<>();
    for (final Object word : ((StopwordAnalyzerBase) analyzer).getStopwordSet()) {
        // Lucene stores stop words as char[]; compare via a String copy.
        final String asString = new String((char[]) word);
        if (!stopWord.equals(asString)) {
            remaining.add(word);
        }
    }
    analyzer = new EnglishAnalyzer(LUCENE_VERSION, remaining);
}

From source file:nicta.com.au.failureanalysis.optimalquery.OptPatentQuery.java

/**
 * Extracts the english text of each patent section into the per-section query
 * strings stored in {@code queries}:
 * [0]=IPC codes, [1]=title, [2]=abstract, [3]=description,
 * [4]=first-5-paragraphs of the description, [5]=claims, [6]=claim #1.
 * Then installs a per-field EnglishAnalyzer wrapper (unified patent stop words
 * when {@code stopWords} is set, generic english stop words otherwise) and
 * rewrites the section queries via {@code transformation(...)}.
 *
 * Side effects: mutates the {@code queries}, {@code boosts} and
 * {@code fullClassCodes} fields and replaces {@code analyzer}.
 *
 * @throws IOException propagated from the analyzer token streams
 */
private void analyze() throws IOException {
    String title = "";
    String ipc = "";
    String abstrac = "";
    String description = "";
    String descriptionP5 = "";
    String claims = "";
    String claims1 = "";

    //********************************************************************
    // leveraging Title
    //********************************************************************
    // Keeps the last english invention title; non-english titles are skipped.
    for (InventionTitle inventionTitle : pt.getTechnicalData().getInventionTitle()) {
        if (inventionTitle.getLang().toLowerCase().equals("en")) {
            title = inventionTitle.getContent();
        }
    }
    //********************************************************************
    // leveraging IPC Codes
    //********************************************************************
    // First token = class code, second = subclass part; distinct class codes go
    // into the ipc query, class+subclass pairs into fullClassCodes.
    // NOTE(review): assumes each IPC entry has at least two whitespace-separated
    // tokens — nextToken() would throw NoSuchElementException otherwise.
    Set<String> codes = new HashSet<>();
    for (ClassificationIpcr ipcCode : pt.getTechnicalData().getClassificationIpcr()) {
        StringTokenizer st = new StringTokenizer(ipcCode.getContent());
        String p1 = st.nextToken();
        String p2 = st.nextToken();
        codes.add(p1);
        fullClassCodes.add(p1 + p2);
    }
    for (String code : codes) {
        if (!ipc.contains(code)) {
            ipc += code + " ";
        }
    }
    //********************************************************************
    // leveraging Abstract
    //********************************************************************
    if (pt.getAbstrac().getLang() != null) {
        if (pt.getAbstrac().getLang().toLowerCase().equals("en")) {
            abstrac = pt.getAbstrac().getContent();
        }
    }
    //********************************************************************
    // leveraging Description
    //********************************************************************
    if (pt.getDescription() != null) {
        if (pt.getDescription().getLang().toLowerCase().equals("en")) {
            for (P p : pt.getDescription().getP()) {
                if (Integer.parseInt(p.getNum()) == 1 || Integer.parseInt(p.getNum()) == 2
                        || Integer.parseInt(p.getNum()) == 3 || Integer.parseInt(p.getNum()) == 4
                        || Integer.parseInt(p.getNum()) == 5) { // Leveraging first 5 paragraphes
                    descriptionP5 += p.getContent() + " ";
                }
                description += p.getContent() + " ";
            }
        }
    }
    //********************************************************************
    // leveraging Claims
    //********************************************************************
    for (Claims cs : pt.getClaims()) {
        if (cs.getLang().toLowerCase().equals("en")) {
            for (Claim claim : cs.getClaim()) {
                if (Integer.parseInt(claim.getNum()) == 1) {// Leveraging Claims 1
                    claims1 += claim.getClaimText() + " ";
                }
                claims += claim.getClaimText() + " ";
            }
        }
    }
    //********************************************************************
    this.queries[0] = ipc;
    this.queries[1] = title;
    this.queries[2] = abstrac;
    this.queries[3] = description;
    this.queries[4] = descriptionP5;
    this.queries[5] = claims;
    this.queries[6] = claims1;
    // Per-field english analyzers; only the stop-word set differs per branch.
    Map<String, Analyzer> analyzerPerField = new HashMap<>();
    if (stopWords) {
        /*analyzerPerField.put(PatentDocument.Title, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.TITLE_ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Abstract, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ABSTRACT_ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Description, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.DESCRIPTION_ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Claims, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.CLAIMS_ENGLISH_STOP_WORDS_SET));*/

        analyzerPerField.put(PatentDocument.Title, new EnglishAnalyzer(Version.LUCENE_48,
                PatentsStopWords.UNIFIED_PATENT__ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Abstract, new EnglishAnalyzer(Version.LUCENE_48,
                PatentsStopWords.UNIFIED_PATENT__ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Description, new EnglishAnalyzer(Version.LUCENE_48,
                PatentsStopWords.UNIFIED_PATENT__ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Claims, new EnglishAnalyzer(Version.LUCENE_48,
                PatentsStopWords.UNIFIED_PATENT__ENGLISH_STOP_WORDS_SET));
    } else {
        analyzerPerField.put(PatentDocument.Title,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Abstract,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Description,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Claims,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
    }
    analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_48), analyzerPerField);

    // oneNumber ends up true iff exactly ONE of fields[1..] has a positive
    // boost; j is then that field's index, otherwise -1.
    boolean oneNumber = false;
    int j = -1;
    for (int i = 1; i < fields.length; i++) {
        float v = boosts.get(fields[i]);
        if (oneNumber == false && v > 0) {
            oneNumber = true;
            j = i;
        } else if (oneNumber && v > 0) {
            oneNumber = false;
            j = -1;
            break;
        }
    }
    if (oneNumber) {
        // Single boosted field: derive every section query from that one
        // field's text and reset the section boosts to 1.0.
        String qText = queries[j];
        String q = transformation(analyzer.tokenStream(fields[j], qText), titleTreshold, PatentDocument.Title);
        queries[1] = q;
        System.err.println(queries[1]);
        boosts.put(PatentDocument.Title, (float) 1.0);

        q = transformation(analyzer.tokenStream(fields[j], qText), abstractTreshold, PatentDocument.Abstract);
        queries[2] = q;
        //            System.err.println(queries[2]);
        boosts.put(PatentDocument.Abstract, (float) 1.0);
        q = transformation(analyzer.tokenStream(fields[j], qText), descriptionTreshold,
                PatentDocument.Description);
        //            System.err.println(q);
        queries[3] = q;
        boosts.put(PatentDocument.Description, (float) 1.0);
        queries[4] = "";
        q = transformation(analyzer.tokenStream(fields[j], qText), claimsTreshold, PatentDocument.Claims);
        //            System.err.println(q);
        queries[5] = q;
        boosts.put(PatentDocument.Claims, (float) 1.0);
        queries[6] = "";
    } else {
        // General case: rewrite each section query from its own section text.
        String[] qText = queries;
        String q = transformation(analyzer.tokenStream(PatentDocument.Title, qText[1]), titleTreshold,
                PatentDocument.Title);
        queries[1] = q;
        System.err.println(queries[1]);
        q = transformation(analyzer.tokenStream(PatentDocument.Abstract, qText[2]), abstractTreshold,
                PatentDocument.Abstract);
        queries[2] = q;
        System.err.println(q);
        q = transformation(analyzer.tokenStream(PatentDocument.Description, qText[3]), descriptionTreshold,
                PatentDocument.Description);
        queries[3] = q;
        System.err.println(q);
        q = transformation(analyzer.tokenStream(PatentDocument.Description, qText[4]), descriptionTreshold,
                null);
        queries[4] = q;
        System.err.println(q);
        q = transformation(analyzer.tokenStream(PatentDocument.Claims, qText[5]), claimsTreshold,
                PatentDocument.Claims);
        queries[5] = q;
        System.err.println(q);
        q = transformation(analyzer.tokenStream(PatentDocument.Claims, qText[6]), claimsTreshold, null);
        queries[6] = q;
        System.err.println(q);
    }
}

From source file:nicta.com.au.failureanalysis.query.QueryGneration.java

/**
 * Builds the per-section query strings exactly like {@code analyze()} in
 * OptPatentQuery (queries[0..6] = IPC, title, abstract, description,
 * first-5-paragraphs, claims, claim #1), then returns the term-frequency map
 * of the requested section.
 *
 * Side effects: mutates the {@code queries}, {@code boosts} and
 * {@code fullClassCodes} fields and replaces {@code analyzer}.
 *
 * @param section one of "title", "abstract", "description"; any other value
 *                returns the claims terms
 * @return term -> count map for the chosen section, as produced by getTerms()
 * @throws IOException propagated from the analyzer token streams
 */
public Map<String, Integer> getSectionTerms(String section) throws IOException {
    String title = "";
    String ipc = "";
    String abstrac = "";
    String description = "";
    String descriptionP5 = "";
    String claims = "";
    String claims1 = "";

    //********************************************************************
    // leveraging Title
    //********************************************************************
    // Keeps the last english invention title; non-english titles are skipped.
    for (InventionTitle inventionTitle : pt.getTechnicalData().getInventionTitle()) {
        if (inventionTitle.getLang().toLowerCase().equals("en")) {
            title = inventionTitle.getContent();
        }
    }
    //********************************************************************
    // leveraging IPC Codes
    //********************************************************************
    // NOTE(review): assumes each IPC entry has at least two whitespace-separated
    // tokens — nextToken() would throw NoSuchElementException otherwise.
    Set<String> codes = new HashSet<>();
    for (ClassificationIpcr ipcCode : pt.getTechnicalData().getClassificationIpcr()) {
        StringTokenizer st = new StringTokenizer(ipcCode.getContent());
        String p1 = st.nextToken();
        String p2 = st.nextToken();
        codes.add(p1);
        fullClassCodes.add(p1 + p2);
    }
    for (String code : codes) {
        if (!ipc.contains(code)) {
            ipc += code + " ";
        }
    }
    //********************************************************************
    // leveraging Abstract
    //********************************************************************
    if (pt.getAbstrac().getLang() != null) {
        if (pt.getAbstrac().getLang().toLowerCase().equals("en")) {
            abstrac = pt.getAbstrac().getContent();
        }
    }
    //********************************************************************
    // leveraging Description
    //********************************************************************
    if (pt.getDescription() != null) {
        if (pt.getDescription().getLang().toLowerCase().equals("en")) {
            for (P p : pt.getDescription().getP()) {
                if (Integer.parseInt(p.getNum()) == 1 || Integer.parseInt(p.getNum()) == 2
                        || Integer.parseInt(p.getNum()) == 3 || Integer.parseInt(p.getNum()) == 4
                        || Integer.parseInt(p.getNum()) == 5) { // Leveraging first 5 paragraphes
                    descriptionP5 += p.getContent() + " ";
                }
                description += p.getContent() + " ";
            }
        }
    }
    //********************************************************************
    // leveraging Claims
    //********************************************************************
    for (Claims cs : pt.getClaims()) {
        if (cs.getLang().toLowerCase().equals("en")) {
            for (Claim claim : cs.getClaim()) {
                if (Integer.parseInt(claim.getNum()) == 1) {// Leveraging Claims 1
                    claims1 += claim.getClaimText() + " ";
                }
                claims += claim.getClaimText() + " ";
            }
        }
    }
    //********************************************************************
    this.queries[0] = ipc;
    this.queries[1] = title;
    this.queries[2] = abstrac;
    this.queries[3] = description;
    this.queries[4] = descriptionP5;
    this.queries[5] = claims;
    this.queries[6] = claims1;
    // Per-field english analyzers; only the stop-word set differs per branch.
    Map<String, Analyzer> analyzerPerField = new HashMap<>();
    if (stopWords) {
        /*analyzerPerField.put(PatentDocument.Title, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.TITLE_ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Abstract, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ABSTRACT_ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Description, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.DESCRIPTION_ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Claims, new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.CLAIMS_ENGLISH_STOP_WORDS_SET));*/

        analyzerPerField.put(PatentDocument.Title, new EnglishAnalyzer(Version.LUCENE_48,
                PatentsStopWords.UNIFIED_PATENT__ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Abstract, new EnglishAnalyzer(Version.LUCENE_48,
                PatentsStopWords.UNIFIED_PATENT__ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Description, new EnglishAnalyzer(Version.LUCENE_48,
                PatentsStopWords.UNIFIED_PATENT__ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Claims, new EnglishAnalyzer(Version.LUCENE_48,
                PatentsStopWords.UNIFIED_PATENT__ENGLISH_STOP_WORDS_SET));
    } else {
        analyzerPerField.put(PatentDocument.Title,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Abstract,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Description,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
        analyzerPerField.put(PatentDocument.Claims,
                new EnglishAnalyzer(Version.LUCENE_48, PatentsStopWords.ENGLISH_STOP_WORDS_SET));
    }
    analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_48), analyzerPerField);

    // oneNumber ends up true iff exactly ONE of fields[1..] has a positive
    // boost; j is then that field's index. The single-boost rewrite below is
    // commented out, so this result is currently unused.
    boolean oneNumber = false;
    int j = -1;
    for (int i = 1; i < fields.length; i++) {
        float v = boosts.get(fields[i]);
        if (oneNumber == false && v > 0) {
            oneNumber = true;
            j = i;
        } else if (oneNumber && v > 0) {
            oneNumber = false;
            j = -1;
            break;
        }
    }
    /*if (oneNumber) {
    String qText = queries[j];
    String q = transformation(analyzer.tokenStream(fields[j], qText), titleTreshold, PatentDocument.Title);
    //            System.err.println(q);
    queries[1] = q;
    boosts.put(PatentDocument.Title, (float) 1.0);
            
    q = transformation(analyzer.tokenStream(fields[j], qText), abstractTreshold, PatentDocument.Abstract);
    //            System.err.println(q);
    queries[2] = q;
    boosts.put(PatentDocument.Abstract, (float) 1.0);
    q = transformation(analyzer.tokenStream(fields[j], qText), descriptionTreshold, PatentDocument.Description);
    //            System.err.println(q);
    queries[3] = q;
    boosts.put(PatentDocument.Description, (float) 1.0);
    queries[4] = "";
    q = transformation(analyzer.tokenStream(fields[j], qText), claimsTreshold, PatentDocument.Claims);
    //            System.err.println(q);
    queries[5] = q;
    boosts.put(PatentDocument.Claims, (float) 1.0);
    queries[6] = "";
    } else {*/
    // Rewrite each section query from its own text, collecting the term maps
    // (t=title, a=abstract, d=description, c=claims) as we go.
    String[] qText = queries;
    String q = transformation(analyzer.tokenStream(PatentDocument.Title, qText[1]), titleTreshold,
            PatentDocument.Title);
    Map<String, Integer> t = getTerms(analyzer.tokenStream(PatentDocument.Title, qText[1]), titleTreshold,
            PatentDocument.Title);
    //            System.err.println(q);
    queries[1] = q;
    q = transformation(analyzer.tokenStream(PatentDocument.Abstract, qText[2]), abstractTreshold,
            PatentDocument.Abstract);
    Map<String, Integer> a = getTerms(analyzer.tokenStream(PatentDocument.Abstract, qText[2]), abstractTreshold,
            PatentDocument.Abstract);
    //            System.err.println(q);
    queries[2] = q;
    q = transformation(analyzer.tokenStream(PatentDocument.Description, qText[3]), descriptionTreshold,
            PatentDocument.Description);
    Map<String, Integer> d = getTerms(analyzer.tokenStream(PatentDocument.Description, qText[3]),
            descriptionTreshold, PatentDocument.Description);
    //            System.err.println(q);
    queries[3] = q;
    q = transformation(analyzer.tokenStream(PatentDocument.Description, qText[4]), descriptionTreshold, null);
    queries[4] = q;
    q = transformation(analyzer.tokenStream(PatentDocument.Claims, qText[5]), claimsTreshold,
            PatentDocument.Claims);
    Map<String, Integer> c = getTerms(analyzer.tokenStream(PatentDocument.Claims, qText[5]), claimsTreshold,
            PatentDocument.Claims);
    //            System.err.println(q);
    queries[5] = q;
    q = transformation(analyzer.tokenStream(PatentDocument.Claims, qText[6]), claimsTreshold, null);
    queries[6] = q;

    // Return the term map for the requested section; anything that is not
    // title/abstract/description falls through to the claims map.
    if (section.equals("title")) {
        //                System.err.println(q);
        return t;
    } else {
        if (section.equals("abstract")) {
            return a;
        } else {
            if (section.equals("description")) {
                return d;
            } else {
                return c;
            }
        }

    }
    //            return t;
    //        }
}

From source file:nicta.com.au.patent.pac.analysis.FieldsCosineSimilarities.java

/**
 * Extracts five per-field term vectors from an English-language patent document:
 * index 0 = title, 1 = IPC classification codes, 2 = abstract,
 * 3 = description, 4 = claims.
 *
 * <p>Only content tagged with language "en" is considered. The IPC vector is
 * built directly from the classification codes (first token of each code,
 * weight 1.0); the other four vectors are produced by {@code getVector} from
 * token streams analyzed with per-field {@link EnglishAnalyzer}s whose
 * stop-word lists depend on {@code specificStopWords}.
 *
 * @param pt the patent document to vectorize
 * @return an array of five term-to-weight maps, ordered as described above
 * @throws IOException if consuming a token stream fails
 * @throws Exception   propagated from {@code getVector}
 */
@SuppressWarnings("unchecked") // generic array creation; elements are always Map<String, Double>
public final Map<String, Double>[] parse(PatentDocument pt) throws IOException, Exception {
    Map<String, Double>[] out = new Map[5];
    String[] ptFields = new String[5];

    // English invention title (last English entry wins, as in the data).
    String title = "";
    for (InventionTitle inventionTitle : pt.getTechnicalData().getInventionTitle()) {
        if (inventionTitle.getLang().equalsIgnoreCase("en")) {
            title = inventionTitle.getContent();
        }
    }

    // IPC vector: first whitespace-delimited token of each code, weight 1.0.
    Map<String, Double> m1 = new HashMap<>();
    for (ClassificationIpcr ipcCode : pt.getTechnicalData().getClassificationIpcr()) {
        StringTokenizer st = new StringTokenizer(ipcCode.getContent());
        m1.put(st.nextToken(), 1.0);
    }

    // Guard the abstract itself, not just its language tag (description below
    // was already null-guarded; this makes the two sections consistent).
    String abstrac = "";
    if (pt.getAbstrac() != null && pt.getAbstrac().getLang() != null
            && pt.getAbstrac().getLang().equalsIgnoreCase("en")) {
        abstrac = pt.getAbstrac().getContent();
    }

    // StringBuilder instead of += in a loop: avoids O(n^2) string copying on
    // long descriptions/claims.
    StringBuilder description = new StringBuilder();
    if (pt.getDescription() != null && pt.getDescription().getLang().equalsIgnoreCase("en")) {
        for (P p : pt.getDescription().getP()) {
            description.append(p.getContent()).append(' ');
        }
    }
    StringBuilder claims = new StringBuilder();
    for (Claims cs : pt.getClaims()) {
        if (cs.getLang().equalsIgnoreCase("en")) {
            for (Claim claim : cs.getClaim()) {
                claims.append(claim.getClaimText()).append(' ');
            }
        }
    }

    ptFields[0] = title;
    ptFields[1] = ""; // IPC text is never analyzed: its vector (m1) is built directly above.
    ptFields[2] = abstrac;
    ptFields[3] = description.toString();
    ptFields[4] = claims.toString();

    // One EnglishAnalyzer per section; they differ only in the stop-word set,
    // so choose the set inline instead of duplicating the whole if/else block.
    Map<String, Analyzer> analyzerPerField = new HashMap<>();
    analyzerPerField.put(PatentDocument.Title, new EnglishAnalyzer(Version.LUCENE_44,
            specificStopWords ? PatentsStopWords.TITLE_ENGLISH_STOP_WORDS_SET
                    : PatentsStopWords.ENGLISH_STOP_WORDS_SET));
    analyzerPerField.put(PatentDocument.Abstract, new EnglishAnalyzer(Version.LUCENE_44,
            specificStopWords ? PatentsStopWords.ABSTRACT_ENGLISH_STOP_WORDS_SET
                    : PatentsStopWords.ENGLISH_STOP_WORDS_SET));
    analyzerPerField.put(PatentDocument.Description, new EnglishAnalyzer(Version.LUCENE_44,
            specificStopWords ? PatentsStopWords.DESCRIPTION_ENGLISH_STOP_WORDS_SET
                    : PatentsStopWords.ENGLISH_STOP_WORDS_SET));
    analyzerPerField.put(PatentDocument.Claims, new EnglishAnalyzer(Version.LUCENE_44,
            specificStopWords ? PatentsStopWords.CLAIMS_ENGLISH_STOP_WORDS_SET
                    : PatentsStopWords.ENGLISH_STOP_WORDS_SET));

    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_44),
            analyzerPerField);

    // Vectorize each analyzed section; slot 1 keeps the IPC vector built earlier.
    out[0] = getVector(analyzer.tokenStream(PatentDocument.Title, ptFields[0]), PatentDocument.Title);
    out[1] = m1;
    out[2] = getVector(analyzer.tokenStream(PatentDocument.Abstract, ptFields[2]), PatentDocument.Abstract);
    out[3] = getVector(analyzer.tokenStream(PatentDocument.Description, ptFields[3]), PatentDocument.Description);
    out[4] = getVector(analyzer.tokenStream(PatentDocument.Claims, ptFields[4]), PatentDocument.Claims);
    return out;
}