Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

On this page you can find example usages of org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Source Link

Document

Returns the instance of the passed in Attribute contained in this AttributeSource

The caller must pass in a Class<? extends Attribute> value.

Usage

From source file:cl.usach.escalemania.sessionbeans.DocumentoFacade.java

/**
 * Runs {@code tweet} through {@code analyzer} and collects the text of every token produced.
 *
 * @param analyzer the analyzer used to build the token stream
 * @param tweet the text to tokenize
 * @return the token terms, in the order the analyzer emitted them
 * @throws RuntimeException wrapping any {@link IOException} raised while tokenizing
 */
public List<String> tokenizeString(Analyzer analyzer, String tweet) {

    List<String> result = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(tweet));
        try {
            stream.reset();
            // The attribute instance is stable for the lifetime of the stream, so fetch it once.
            CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
            while (stream.incrementToken()) {
                result.add(term.toString());
            }
            // Complete the consumer workflow before closing.
            stream.end();
        } finally {
            // Close even when reset()/incrementToken() fails; the original leaked the stream then.
            stream.close();
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}

From source file:cn.com.szgao.enterprise.ExecutorsIndustry.java

/**
 * Reads {@code file} line by line, parses every line as JSON into an {@code EnterpriseVO},
 * tries to derive an industry from the record's scope text and writes the re-serialized
 * record through {@code writerString}.
 *
 * <p>The output path is the input path after two {@code String.replace} calls (see
 * {@code filePath}). An existing output file is deleted first.
 *
 * <p>NOTE(review): several string literals and all original comments in this method were
 * damaged by an encoding problem (they now read {@code ?}); the literals are kept exactly as
 * found and should be restored from the original sources.
 *
 * <p>NOTE(review): neither {@code reader} nor {@code fw} is closed inside this method, and the
 * {@code TokenStream} obtained per scope fragment is never closed either.
 *
 * @param file the input file, one JSON document per line, decoded as UTF-8
 * @param startNum lines whose 1-based counter is below this value are skipped
 * @throws Exception anything not handled locally, e.g. I/O errors from {@code readLine()}
 */
@SuppressWarnings("rawtypes")
 private void readFileByLines(File file, int startNum) throws Exception {

     // Counter that is only reported in the final log line; it is never incremented here.
     int basicSum = 0;
     // Counter that is only reported in the final log line; it is never incremented here.
     int regSum = 0;

     // Unused in this method.
     String batchNum = file.getName();
     // Unused in this method.
     StringBuffer sb = new StringBuffer();
     // FileReader fr = new FileReader(file);
     // int ch = 0;
     // while((ch = fr.read())!=-1 )
     // {
     // sb.append((char)ch);
     // }
     // fr.close();
     // fr = null;

     // BufferedWriter fw = null;
     String encoding_from = "UTF-8";// GB18030
     // String encoding_to = "UTF-8";
     BufferedReader reader = null;
     try {
         // Earlier variants used other charsets (GB18030 / GBK), see the commented-out lines.

         // InputStreamReader isr = new InputStreamReader(new
         // FileInputStream(file), "UTF-8");
         // InputStreamReader isr = new InputStreamReader(new
         // FileInputStream(file), "GBK");
         InputStreamReader isr = new InputStreamReader(new FileInputStream(file), encoding_from);
         reader = new BufferedReader(isr);

     } catch (FileNotFoundException e1) {
         // NOTE(review): reader stays null here, so the readLine() loop below will throw a
         // NullPointerException.
         e1.printStackTrace();
     }

     String tempT = null;

     // Number of lines that passed the size check so far (1-based after increment).
     int readNum = 0;

     // E:\\data\\--0008.txt
     // ss and ss2 are unused in this method.
     String ss = file.getPath().substring(file.getPath().indexOf("data") + 4);
     String ss2 = file.getPath().substring(file.getPath().indexOf("data") + 5, file.getPath().lastIndexOf("\\"));

     //      String folderPath = "D:/lm/log/?/" + ss2;
     // Output path: the input path with one (garbled) segment replaced and drive E: mapped to D:.
     String filePath = file.getPath().replace("???", "???industryId")
             .replace("E:", "D:");

     // 
     //      FileUtils.newFolder(folderPath);
     File fileS = new File(filePath);
     String encoding_from1 = "UTF-8";
     BufferedWriter fw = null;
     try {
         if (!fileS.exists()) {
             try {
                 fileS.createNewFile();
             } catch (IOException e) {
                 e.printStackTrace();
                 log.error(e);
             }
             fw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileS, true), encoding_from1)); // opened in append mode
         } else {
             // Start from scratch: drop the previous output before reopening in append mode.
             fileS.delete();
             fileS = new File(filePath);
             fw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileS, true), encoding_from1)); // opened in append mode
         }
     } catch (FileNotFoundException e1) {
         // NOTE(review): fw stays null here and is still passed to writerString below.
         e1.printStackTrace();
     }

     int M = 1;// size limit per line, in MiB
     while ((tempT = reader.readLine()) != null) {

         // tempT="";

         // Byte length in the platform default charset (getBytes() without an argument).
         long size = tempT.getBytes().length;
         if (size > M * 1024 * 1024) {
             // Oversized lines are logged (first 500 chars) and skipped without counting them.
             log.error("--------?1M :" + tempT.substring(0, 500));
             continue;
         }

         /**
          * Unused in this method.
          */
         JSONArray holderArray = null;
         /**
          * Unused in this method.
          */
         JSONArray holderDetArray = null;
         // The following four locals are unused, except that enterVO is reassigned by the parse below.
         List<String> beforeList = new ArrayList<String>();
         List<String> afterList = new ArrayList<String>();

         EnterpriseVO enterVO = new EnterpriseVO();
         EnterpriseVO enterVOT = new EnterpriseVO();

         readNum++;

         // if(readNum<34431){
         // continue;
         // }
         // if(readNum==34431){
         //
         // writerString(fwUn, tempT);
         // break;
         // }

         // Resume support: ignore everything before the requested start line.
         if (readNum < startNum) {
             continue;
         }
         System.out.println("-->>>>>>> " + (readNum) + "---??" + Thread.currentThread().getName() + "---"
                 + file.getPath());
         // NOTE(review): tempT cannot be null inside this loop, and `tempT == ""` compares
         // references, not content, so an empty line is most likely not caught here.
         if (tempT == null || tempT == "") {
             continue;
         }

         // doString and obj are unused in this method.
         String doString = tempT.toString();
         JSONObject obj = null;
         // System.out.println("in..." + doString);
         try {
             enterVO = gs.fromJson(tempT, EnterpriseVO.class);
         } catch (Exception e) {
             // Unparseable lines are logged and skipped.
             e.printStackTrace();
             log.error(e);
             continue;
             // return;
         }

         // Never assigned after this point, so the isNull(industry) check below sees null.
         String industry = null;
         // A new analyzer is created for every input line.
         Analyzer anal = new IKAnalyzer();
         String indu = removeBlank(enterVO.getScope());
         IndustryVO ivo = null;
         IndustryVO ivo1 = null;
         if (StringUtils.isNull(industry)) {
             if (!StringUtils.isNull(indu)) {
                 // Split the scope text into fragments on ';', '.', ':' and ','.
                 String[] sourceStrArray = indu.split("[;.:,]");
                 for (String str : sourceStrArray) {
                     // System.out.println("-- " + str);

                     // First try the whole fragment as a lookup key.
                     // NOTE(review): ivo is dereferenced without a null check; assumes
                     // getIndustry never returns null - confirm.
                     ivo = getIndustry(str);
                     if (!StringUtils.isNull(ivo.getIndustry_id())) {
                         break;
                     }

                     // Fall back to IK tokenization only when the id is exactly null.
                     if (null == ivo.getIndustry_id()) {

                         StringReader reader1 = new StringReader(str);
                         // NOTE(review): ts is never closed.
                         TokenStream ts = null;
                         ts = anal.tokenStream("", reader1);
                         CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
                         try {
                             ts.reset();

                             String strs = "";
                             // Join all non-empty tokens into one '|'-separated string.
                             while (ts.incrementToken()) {
                                 String temp = term.toString();
                                 if (!StringUtils.isNull(temp)) {
                                     // getIndustry(temp);
                                     strs += term.toString() + "|";
                                     // System.out.print(term.toString() + "|" );
                                 }
                             }
                             reader1.close();

                             String[] arrStr1 = strs.split("\\|");
                             // Ordering is delegated to the project helper; the meaning of
                             // the second argument is not visible here.
                             StringUtils.sortStringArray(arrStr1, 1);
                             List<IndustryVO> listiv = new ArrayList<IndustryVO>();
                             List<IndustryVO> listiv_v = new ArrayList<IndustryVO>();
                             List<IndustryVO> listiv_n = new ArrayList<IndustryVO>();

                             // Keep every token that resolves to an industry with an id.
                             for (int i = 0; i < arrStr1.length; i++) {
                                 String temp = arrStr1[i];
                                 if (!StringUtils.isNull(temp)) {
                                     ivo1 = getIndustry(temp);
                                     if (!StringUtils.isNull(ivo1.getIndustry_id())) {
                                         listiv.add(ivo1);
                                         // break;
                                     }
                                 }
                             }
                             // Partition the hits by flag: "V" versus everything else.
                             if (listiv.size() > 0) {
                                 for (IndustryVO industryVO : listiv) {
                                     if ("V".equals(industryVO.getFlag())) {
                                         listiv_v.add(industryVO);
                                     } else {
                                         listiv_n.add(industryVO);
                                     }
                                 }
                             }
                             // "V" hits take precedence over the others; either ends the fragment loop.
                             if (listiv_v.size() > 0) {
                                 ivo = getfirstSortStringArray(listiv_v, 1);
                                 break;
                             }
                             if (listiv_n.size() > 0) {
                                 ivo = getfirstSortStringArray(listiv_n, 1);
                                 break;
                             }

                         } catch (IOException e) {
                             // Tokenization failure: move on to the next fragment.
                             e.printStackTrace();
                         }
                     }

                     if (!StringUtils.isNull(ivo.getIndustry_id())) {
                         break;
                     }
                 }
             }
         }
         // ivo is non-null whenever at least one fragment was looked up, even without a match.
         if (null != ivo) {
             enterVO.setIndustry(ivo.getIndustry_name());
             enterVO.setIndustryId(ivo.getIndustry_id());
         }
         // System.out.println("in..." + obj);

         writerString(fw, StringUtils.GSON.toJson(enterVO));
     }
     log.info("?regSum: " + regSum + " ?basicSum: " + basicSum + " readNum: "
             + readNum + " -: " + (readNum - basicSum - regSum) + "---??" + file.getPath());
 }

From source file:cn.com.szgao.enterprise.ExecutorsIndustry.java

/**
 * Cleans the fields of {@code enterVO} in place and returns the same instance.
 *
 * <p>Steps, in order: company name via {@code deleteMoreFuhao}; scope text; registered
 * capital (cleaned copy, numeric value and unit); industry derived from the scope text;
 * registration number truncated at the first NUL character; holder, punishment, change and
 * report replaced by the results of the corresponding {@code WashEtp.clear*} helpers.
 *
 * <p>NOTE(review): many string literals and all original comments in this method were damaged
 * by an encoding problem (they now read {@code ?} or are empty); the literals are kept exactly
 * as found and should be restored from the original sources.
 *
 * @param enterVO the record to clean; modified in place
 * @return {@code enterVO}
 * @throws IOException declared by the signature; IOExceptions raised while tokenizing are
 *         caught and printed inside the method
 */
 public static EnterpriseVO washEnterpriseVO(EnterpriseVO enterVO) throws IOException {

     if (!StringUtils.isNull(enterVO.getCompany())) {
         enterVO.setCompany(deleteMoreFuhao(enterVO.getCompany()));
     }

     // Scope text clean-up.
     String scope = enterVO.getScope();
     if (!StringUtils.isNull(scope)) {
         // NOTE(review): this is a character class, so it removes every single '&', 'n', 'b',
         // 's', 'p', ';', CR, TAB and '^' character, not the literal text "&nbsp;".
         scope = scope.replaceAll("[&nbsp;\r\t^]", "");
         if (scope.length() > 5) {
             String temp = "";
             // System.out.println(scope);
             // if (scope.substring(scope.length() - 5).contains("^")) {
             // temp = scope.substring(scope.length() - 5).replace("^", "");
             // scope = scope.substring(0, scope.length() - 5) + temp;
             // } else {
             // Strip '#', '*' and '+' from the last five characters only.
             temp = scope.substring(scope.length() - 5).replaceAll("[#*+]", "");
             scope = scope.substring(0, scope.length() - 5) + temp;
             // }
         }
         enterVO.setScope(scope);
     }
     String regCapital = enterVO.getRegCapital();
     // Registered capital clean-up.
     if (!StringUtils.isNull(regCapital)) {

         // NOTE(review): same character-class caveat as for the scope text above.
         String capital = enterVO.getRegCapital().replaceAll("[&nbsp;\r\t\n]", "");
         // enterVO.setRegCapital(capital);

         // The cleaned text is stored separately in regCapitalO.
         enterVO.setRegCapitalO(capital);

         Double regCapitalN = getBankingDouble(capital);
         if (null != regCapitalN) {
             // Numeric value recognised: store it and clear the raw text field.
             enterVO.setRegCapitalN(regCapitalN);
             enterVO.setRegCapital(null);

             // Unit detection on the uncleaned text.
             // NOTE(review): the literals in this chain lost their original characters. As
             // written, contains("") is always true, so only the first branch can ever run.
             if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("?");
             } else if (regCapital.contains("?")) {
                 enterVO.setUnit("??");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("?")) {
                 enterVO.setUnit("?");
             } else if (regCapital.contains("?")) {
                 enterVO.setUnit("?");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("?")) {
                 enterVO.setUnit("?");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             }

             else {
                 enterVO.setUnit("?");
             }
         }

     }

     // industry is never assigned after this point; industryId is unused.
     String industry = null;
     String industryId = null;

     /*
      * if (!StringUtils.isNull(enterVO.getCompany())) { String key =
      * StringUtils.NBG.generate(enterVO.getCompany()).toString();
      * JsonDocument queryDoc = ExecutorsText.bucket.get(key, 60,
      * TimeUnit.MINUTES);
      * 
      * if (null == queryDoc) { // System.out.println(
      * "NULL queryDoc-------------------- " + // key); //
      * System.out.println("NULL queryDoc-------------------- " + // key);
      * 
      * } else { BusinessDirectoryVO vo =
      * StringUtils.GSON.fromJson(queryDoc.content().toString(),
      * BusinessDirectoryVO.class); if
      * (!StringUtils.isNull(vo.getIndustry())) { industry =
      * vo.getIndustry(); } } }
      */

     // Derive the industry from the (already cleaned) scope text.
     Analyzer anal = new IKAnalyzer();
     String indu = removeBlank(enterVO.getScope());
     IndustryVO ivo = null;
     IndustryVO ivo1 = null;
     if (StringUtils.isNull(industry)) {
         if (!StringUtils.isNull(indu)) {
             // Split the scope text into fragments on ';', '.', ':' and ','.
             String[] sourceStrArray = indu.split("[;.:,]");
             for (String str : sourceStrArray) {
                 // System.out.println("-- " + str);

                 // First try the whole fragment as a lookup key.
                 // NOTE(review): ivo is dereferenced without a null check; assumes
                 // getIndustry never returns null - confirm.
                 ivo = getIndustry(str);
                 if (!StringUtils.isNull(ivo.getIndustry_id())) {
                     break;
                 }

                 // Fall back to IK tokenization only when the id is exactly null.
                 if (null == ivo.getIndustry_id()) {

                     StringReader reader = new StringReader(str);
                     // NOTE(review): ts is never closed.
                     TokenStream ts = null;
                     ts = anal.tokenStream("", reader);
                     CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
                     try {
                         ts.reset();

                         String strs = "";
                         // Join all non-empty tokens into one '|'-separated string.
                         while (ts.incrementToken()) {
                             String temp = term.toString();
                             if (!StringUtils.isNull(temp)) {
                                 // getIndustry(temp);
                                 strs += term.toString() + "|";
                                 // System.out.print(term.toString() + "|" );
                             }
                         }
                         reader.close();

                         String[] arrStr1 = strs.split("\\|");
                         // Ordering is delegated to the project helper; the meaning of the
                         // second argument is not visible here.
                         StringUtils.sortStringArray(arrStr1, 1);
                         List<IndustryVO> listiv = new ArrayList<IndustryVO>();
                         List<IndustryVO> listiv_v = new ArrayList<IndustryVO>();
                         List<IndustryVO> listiv_n = new ArrayList<IndustryVO>();

                         // Keep every token that resolves to an industry with an id.
                         for (int i = 0; i < arrStr1.length; i++) {
                             String temp = arrStr1[i];
                             if (!StringUtils.isNull(temp)) {
                                 ivo1 = getIndustry(temp);
                                 if (!StringUtils.isNull(ivo1.getIndustry_id())) {
                                     listiv.add(ivo1);
                                     // break;
                                 }
                             }
                         }
                         // Partition the hits by flag: "V" versus everything else.
                         if (listiv.size() > 0) {
                             for (IndustryVO industryVO : listiv) {
                                 if ("V".equals(industryVO.getFlag())) {
                                     listiv_v.add(industryVO);
                                 } else {
                                     listiv_n.add(industryVO);
                                 }
                             }
                         }
                         // "V" hits take precedence over the others; either ends the fragment loop.
                         if (listiv_v.size() > 0) {
                             ivo = getfirstSortStringArray(listiv_v, 1);
                             break;
                         }
                         if (listiv_n.size() > 0) {
                             ivo = getfirstSortStringArray(listiv_n, 1);
                             break;
                         }

                     } catch (IOException e) {
                         // Tokenization failure: move on to the next fragment.
                         e.printStackTrace();
                     }
                 }

                 if (!StringUtils.isNull(ivo.getIndustry_id())) {
                     break;
                 }
             }
         }
     }
     // ivo is non-null whenever at least one fragment was looked up, even without a match.
     if (null != ivo) {
         enterVO.setIndustry(ivo.getIndustry_name());
         enterVO.setIndustryId(ivo.getIndustry_id());
     }

     if (!StringUtils.isNull(enterVO.getRegNum())) {

         // Cut the registration number at the first NUL character, if any.
         if (enterVO.getRegNum().indexOf("\u0000") != -1) {
             enterVO.setRegNum(enterVO.getRegNum().substring(0, enterVO.getRegNum().indexOf("\u0000")));
         }
     }

     // Replace the nested collections with the output of the WashEtp helpers.
     enterVO.setHolder(WashEtp.clearHolder(enterVO));
     enterVO.setPunishment(WashEtp.clearPunishment(enterVO));
     enterVO.setChange(WashEtp.clearChangeItem(enterVO));
     enterVO.setReport(WashEtp.clearReport(enterVO));

     return enterVO;
 }

From source file:cn.com.szgao.enterprise.FileIntoDataBase2p5.java

/**
  * ik??/*  w w  w . ja v  a 2  s . c om*/
  * 
  * @param str
  * @return
  * @return IndustryVO
  * @author liuming
  * @date 2016629 ?2:36:09
  */
 public static IndustryVO getIndustryVOFromIk(String str) {
     // ?
     Analyzer anal = new IKAnalyzer();
     IndustryVO ivo = new IndustryVO();
     IndustryVO ivo1 = new IndustryVO();

     StringReader reader = new StringReader(str);
     // ?
     TokenStream ts = null;
     ts = anal.tokenStream("", reader);
     CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
     try {
         ts.reset();

         String strs = "";
         // ????
         while (ts.incrementToken()) {
             String temp = term.toString();
             if (!StringUtils.isNull(temp)) {
                 // getIndustry(temp);
                 strs += term.toString() + "|";
                 // System.out.print(term.toString() + "|" );
             }
         }

         reader.close();

         String[] arrStr1 = strs.split("\\|");
         StringUtils.sortStringArray(arrStr1, 1);// ?
         List<IndustryVO> listiv = new ArrayList<IndustryVO>();
         List<IndustryVO> listiv_v = new ArrayList<IndustryVO>();
         List<IndustryVO> listiv_n = new ArrayList<IndustryVO>();

         for (int i = 0; i < arrStr1.length; i++) {
             String temp = arrStr1[i];
             if (!StringUtils.isNull(temp)) {
                 ivo1 = getIndustry(temp);
                 if (!StringUtils.isNull(ivo1.getIndustry_id())) {
                     listiv.add(ivo1);
                     // break;
                 }
             }
         }
         // ?????
         // if (listiv.size() > 0) {
         // for (IndustryVO industryVO : listiv) {
         // if ("V".equals(industryVO.getFlag())) {
         // listiv_v.add(industryVO);
         // } else {
         // listiv_n.add(industryVO);
         // }
         // }
         // }

         // ???
         if (listiv.size() > 0) {
             ivo = getfirstSortStringArray(listiv, 1);
         }

         // if (listiv_v.size() > 0) {
         // ivo = getfirstSortStringArray(listiv_v, 1);
         // }
         // if (listiv_n.size() > 0) {
         // ivo = getfirstSortStringArray(listiv_n, 1);
         // }

     } catch (IOException e) {
         e.printStackTrace();
     }
     return ivo;
 }

From source file:cn.com.szgao.enterprise.FileIntoDataBase2p5WashJson.java

/**
 * Derives an industry for {@code str} by tokenizing it with the IK analyzer and looking up
 * each token.
 *
 * <p>The non-empty tokens are joined with {@code |}, split back into an array, ordered by
 * {@code StringUtils.sortStringArray(tokens, 1)} and resolved through {@code getIndustry}.
 * Every result whose industry id passes {@code !StringUtils.isNull} is kept, and
 * {@code getfirstSortStringArray(matches, 1)} selects the value to return.
 *
 * @param str the text to classify
 * @return the selected {@code IndustryVO}; a freshly created, empty {@code IndustryVO} when no
 *         token resolves to an industry or when tokenizing fails with an {@code IOException}
 * @author liuming
 */
 public static IndustryVO getIndustryVOFromIk(String str) {
     Analyzer anal = new IKAnalyzer();
     IndustryVO ivo = new IndustryVO();

     TokenStream ts = anal.tokenStream("", new StringReader(str));
     try {
         CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
         ts.reset();

         // StringBuilder instead of String += inside the loop.
         StringBuilder joined = new StringBuilder();
         while (ts.incrementToken()) {
             String token = term.toString();
             if (!StringUtils.isNull(token)) {
                 joined.append(token).append('|');
             }
         }
         // Complete the consumer workflow; close() happens in the finally block.
         ts.end();

         String[] tokens = joined.toString().split("\\|");
         StringUtils.sortStringArray(tokens, 1);

         List<IndustryVO> matches = new ArrayList<IndustryVO>();
         for (String token : tokens) {
             if (StringUtils.isNull(token)) {
                 continue;
             }
             IndustryVO candidate = getIndustry(token);
             if (!StringUtils.isNull(candidate.getIndustry_id())) {
                 matches.add(candidate);
             }
         }

         if (!matches.isEmpty()) {
             ivo = getfirstSortStringArray(matches, 1);
         }
     } catch (IOException e) {
         // Best effort, as before: report the failure and return the empty IndustryVO.
         e.printStackTrace();
     } finally {
         // The original never closed the stream; closing it also releases the reader.
         try {
             ts.close();
         } catch (IOException e) {
             e.printStackTrace();
         }
     }
     return ivo;
 }

From source file:cn.jcenterhome.web.action.CpAction.java

/**
 * Extracts up to five keywords from {@code text}.
 *
 * <p>The text is tokenized with the IK analyzer and the occurrences of every token with
 * {@code length() > 1} and {@code Common.strlen(word) > 2} are counted. The text is then
 * indexed into an in-memory directory, each counted word is searched for, and the word is
 * weighted with {@code maxScore * count}. The words are ordered by
 * {@code getSortedHashtableByValue} and the first five are returned.
 *
 * @param text the text to analyse; an empty result is returned when {@code Common.empty(text)}
 * @return at most five keywords; never {@code null}
 * @throws IOException if tokenizing the text fails
 */
private List<String> getKeyWord(String text) throws IOException {
    List<String> keywords = new ArrayList<String>();
    if (Common.empty(text)) {
        return keywords;
    }

    Analyzer analyzer = new IKAnalyzer(true);
    Map<String, Integer> words = new HashMap<String, Integer>();
    TokenStream tokenStream = analyzer.tokenStream("*", new StringReader(text));
    try {
        // Cast kept from the original in case getAttribute is not generic in the Lucene version used.
        TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
        while (tokenStream.incrementToken()) {
            String word = termAtt.term();
            if (word.length() > 1 && Common.strlen(word) > 2) {
                Integer count = words.get(word);
                words.put(word, count == null ? 1 : count + 1);
            }
        }
    } finally {
        // The original never closed the stream.
        tokenStream.close();
    }
    if (words.isEmpty()) {
        return keywords;
    }

    Directory dir = null;
    IndexSearcher searcher = null;
    try {
        String fieldName = "text";
        dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
        try {
            Document doc = new Document();
            doc.add(new Field(fieldName, text, Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
        } finally {
            // Close the writer even when addDocument fails.
            writer.close();
        }

        searcher = new IndexSearcher(dir);
        searcher.setSimilarity(new IKSimilarity());
        Map<String, Float> temps = new HashMap<String, Float>();
        for (Entry<String, Integer> word : words.entrySet()) {
            Query query = IKQueryParser.parse(fieldName, word.getKey());
            TopDocs topDocs = searcher.search(query, 1);
            if (topDocs.totalHits > 0) {
                temps.put(word.getKey(), topDocs.getMaxScore() * word.getValue());
            }
        }

        Entry<String, Float>[] keywordEntry = getSortedHashtableByValue(temps);
        for (Entry<String, Float> entry : keywordEntry) {
            if (keywords.size() >= 5) {
                break;
            }
            keywords.add(entry.getKey());
        }
    } catch (Exception e) {
        // Best effort, as before: report the failure and return what was collected so far.
        e.printStackTrace();
    } finally {
        // Both may still be null when the failure happened before they were assigned; the
        // original dereferenced them unconditionally and could throw a NullPointerException.
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (dir != null) {
            try {
                dir.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return keywords;
}

From source file:com.antsdb.saltedfish.sql.vdm.LuceneUtil.java

License:Open Source License

/**
 * Tokenizes {@code text} with a {@code StandardAnalyzer} and hands every token to
 * {@code lambda}.
 *
 * @param text the text to tokenize
 * @param lambda receives, for each token, the token type as first argument and the token
 *        text as second argument
 * @throws RuntimeException wrapping any {@link IOException} raised while tokenizing
 */
static void tokenize(String text, BiConsumer<String, String> lambda) {
    // Resources are closed in reverse order: the stream first, then the analyzer. The
    // original closed only the analyzer.
    try (StandardAnalyzer analyzer = new StandardAnalyzer();
            TokenStream stream = analyzer.tokenStream("", text)) {
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        TypeAttribute type = stream.getAttribute(TypeAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            lambda.accept(type.type(), term.toString());
        }
        // Complete the consumer workflow before the stream is closed.
        stream.end();
    } catch (IOException x) {
        throw new RuntimeException(x);
    }
}

From source file:com.b2international.index.compat.Highlighting.java

License:Apache License

/**
 * Splits a string to a list of tokens using the specified Lucene analyzer.
 * /*w w  w. j a  va 2  s. co m*/
 * @param analyzer the analyzer determining token boundaries (may not be {@code null})
 * @param s the string to split
 * @return a list of tokens, or an empty list if {@code s} is {@code null} or empty
 */
public static List<String> split(Analyzer analyzer, final String s) {

    checkNotNull(analyzer, "analyzer");

    if (Strings.isNullOrEmpty(s)) {
        return ImmutableList.of();
    }

    final List<String> tokens = Lists.newArrayList();
    TokenStream stream = null;

    try {

        stream = analyzer.tokenStream(null, new StringReader(s));
        stream.reset();

        while (stream.incrementToken()) {
            tokens.add(stream.getAttribute(CharTermAttribute.class).toString());
        }

    } catch (final IOException ignored) {
        // Should not be thrown when using a string reader
    } finally {
        endAndCloseQuietly(stream);
    }

    return tokens;
}

From source file:com.basistech.elasticsearch.index.analysis.rosette.SimpleRosetteAnalysisTests.java

License:Open Source License

/**
 * Asserts that {@code stream} produces exactly the terms in {@code expected}, in order.
 *
 * @param stream the token stream to consume; it is reset before reading
 * @param expected the expected term texts
 * @throws IOException if the stream fails while being reset or advanced
 */
public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    Assert.assertNotNull(termAttr);

    int seen = 0;
    for (; stream.incrementToken(); seen++) {
        final String actual = termAttr.toString();
        // Fail with a readable message before indexing past the end of the expectations.
        Assert.assertTrue(seen < expected.length, "got extra term: " + actual);
        Assert.assertEquals(actual, expected[seen], "expected different term at index " + seen);
    }

    Assert.assertEquals(seen, expected.length, "not all tokens produced");
}

From source file:com.basistech.IndexFiles.java

License:Open Source License

/**
 * Tokenizes every {@code *.txt} file directly inside {@code directory} with the instance's
 * {@code analyzer}, consuming all tokens and reading each token's start offset.
 *
 * @param directory the directory whose {@code .txt} files are processed
 * @throws IOException if the directory cannot be listed, or if reading or tokenizing a file
 *         fails
 */
private void iterateOverFiles(File directory) throws IOException {
    File[] textFiles = directory.listFiles(new FilenameFilter() {
        public boolean accept(File dir, String name) {
            return name.endsWith(".txt");
        }
    });
    // listFiles returns null when the path is not a directory or an I/O error occurs; the
    // original failed with a NullPointerException in that case.
    if (textFiles == null) {
        throw new IOException("Unable to list files in " + directory);
    }

    for (File dataFile : textFiles) {
        Reader dataReader = null;
        TokenStream tokenStream = null;
        try {
            dataReader = Files.newReader(dataFile, Charsets.UTF_8);
            tokenStream = analyzer.tokenStream("full_text", dataReader);
            tokenStream.reset();
            OffsetAttribute offsets = tokenStream.getAttribute(OffsetAttribute.class);

            while (tokenStream.incrementToken()) {
                offsets.startOffset();
            }
            // Complete the consumer workflow before closing.
            tokenStream.end();
        } finally {
            // The original never closed the token stream.
            try {
                if (tokenStream != null) {
                    tokenStream.close();
                }
            } finally {
                IOUtils.closeQuietly(dataReader);
            }
        }
    }

}