List of usage examples for org.apache.lucene.analysis.shingle ShingleAnalyzerWrapper tokenStream
public final TokenStream tokenStream(final String fieldName, final Reader reader)
Returns a TokenStream for fieldName, tokenizing the contents of reader.
From source file: fr.ericlab.sondy.core.DataManipulation.java
License: Open Source License
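Before the full source file, here is a minimal, self-contained sketch of the call itself. It assumes the same Lucene 3.6 API that the source below uses; the class name ShingleDemo, the field name "text", and the sample sentence are illustrative, not part of the original project.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class ShingleDemo {
    public static void main(String[] args) throws Exception {
        // Wrap a StandardAnalyzer so tokenStream() emits bigrams only
        // (min = max = 2, space-separated, outputUnigrams = false).
        ShingleAnalyzerWrapper shingles = new ShingleAnalyzerWrapper(
                new StandardAnalyzer(Version.LUCENE_36), 2, 2, " ", false, false);
        TokenStream stream = shingles.tokenStream("text",
                new StringReader("lucene shingle analyzer wrapper demo"));
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // Prints: "lucene shingle", "shingle analyzer", "analyzer wrapper", "wrapper demo"
            System.out.println(term.toString());
        }
        stream.end();
        stream.close();
    }
}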
public void prepareStream(String datasetName, int intervalDuration, int ngram, String stemLanguage,
        boolean lemmatization, AppVariables appVariables) {
    try {
        Connection connection;
        Class.forName("com.mysql.jdbc.Driver").newInstance();
        connection = DriverManager.getConnection("jdbc:mysql://" + appVariables.configuration.getHost(),
                appVariables.configuration.getUsername(), appVariables.configuration.getPassword());
        Statement statement = connection.createStatement();
        Statement statement2 = connection.createStatement();
        String lemStr = (lemmatization) ? "_lem1" : "_lem0";
        // Create the table that holds the messages, tagged with the time slice they fall into.
        statement.executeUpdate("CREATE TABLE " + appVariables.configuration.getSchema() + "." + datasetName
                + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram ( id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, msg_author VARCHAR(100), msg_post_time TIMESTAMP, msg_text VARCHAR(600), time_slice INT)ENGINE=myisam;");
        // statement.executeUpdate("CREATE INDEX index_time ON "+appVariables.configuration.getSchema()+"."+datasetName+"_messages (msg_post_time)");
        // Compute the dataset's time span in minutes.
        ResultSet rsTMin = statement.executeQuery("select min(msg_post_time) from "
                + appVariables.configuration.getSchema() + "." + datasetName + "_messages;");
        rsTMin.next();
        Timestamp tMin = rsTMin.getTimestamp(1);
        ResultSet rsTMax = statement.executeQuery("select max(msg_post_time) from "
                + appVariables.configuration.getSchema() + "." + datasetName + "_messages;");
        rsTMax.next();
        Timestamp tMax = rsTMax.getTimestamp(1);
        Timestamp tRef = new Timestamp(0);
        long base = (tMin.getTime() - tRef.getTime()) * 1L;
        long streamDuration = (tMax.getTime() - tMin.getTime()) * 1L;
        long streamDurationMin = (streamDuration / 1000) / 60;
        // Directories for the two Lucene indexes: all messages, and mention-only messages.
        String path = appVariables.configuration.getWorkspace() + "/datasets/" + datasetName + "/"
                + intervalDuration + "min-" + stemLanguage;
        path += (lemmatization) ? "-lem1" : "-lem0";
        path += "-" + ngram + "gram";
        String pathMention = path + "-m";
        FSDirectory indexGlobal = FSDirectory.open(new File(path));
        FSDirectory indexMention = FSDirectory.open(new File(pathMention));
        Analyzer analyzer;
        // Stanford CoreNLP pipeline, used only when lemmatization is requested.
        Properties props = new Properties();
        props.put("annotators", "tokenize,ssplit,parse,lemma");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation annotation;
        // Choose the analyzer for the requested language, e.g. "French" ->
        // org.apache.lucene.analysis.fr.FrenchAnalyzer, loaded by reflection.
        if (stemLanguage.equalsIgnoreCase("Standard")) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        } else {
            Class cl;
            if (stemLanguage.equals("Chinese")) {
                analyzer = new SmartChineseAnalyzer(Version.LUCENE_36);
            } else {
                String packageName = stemLanguage.substring(0, 2).toLowerCase();
                cl = Class.forName("org.apache.lucene.analysis." + packageName + "." + stemLanguage + "Analyzer");
                Class[] types = new Class[] { Version.class, Set.class };
                Constructor ct = cl.getConstructor(types);
                analyzer = (Analyzer) ct.newInstance(Version.LUCENE_36, appVariables.currentStopWords.getSet());
            }
        }
        IndexWriterConfig configGlobal;
        IndexWriterConfig configMention;
        ShingleAnalyzerWrapper shingleAnalyzer = null;
        if (ngram > 1) {
            // Emit shingles of exactly `ngram` words, separated by a space, without unigrams.
            // The shingled text is then indexed with a plain WhitespaceAnalyzer.
            shingleAnalyzer = new ShingleAnalyzerWrapper(analyzer, ngram, ngram, " ", false, false);
            WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_36);
            configGlobal = new IndexWriterConfig(Version.LUCENE_36, whitespaceAnalyzer);
            configMention = new IndexWriterConfig(Version.LUCENE_36, whitespaceAnalyzer);
        } else {
            configGlobal = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            configMention = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        }
        IndexWriter wGlobal = new IndexWriter(indexGlobal, configGlobal);
        IndexWriter wMention = new IndexWriter(indexMention, configMention);
        int docId = 0;
        // One iteration (and one Lucene document) per time slice of intervalDuration minutes.
        for (int i = 0; i < streamDurationMin; i += intervalDuration) {
            statement = connection.createStatement();
            long infBound = base + i * 60 * 1000L;
            long supBound = base + (i + intervalDuration) * 60 * 1000L;
            Timestamp infTime = new Timestamp(infBound);
            Timestamp supTime = new Timestamp(supBound);
            ResultSet rs = statement.executeQuery("SELECT msg_text, msg_post_time, msg_author FROM "
                    + appVariables.configuration.getSchema() + "." + datasetName
                    + "_messages WHERE msg_post_time>'" + infTime + "' AND msg_post_time< '" + supTime + "'");
            String globalContent = "";
            String mentionContent = "";
            String timestamps = "";
            NumberFormat formatter = new DecimalFormat("00000000");
            int bulk = 0;
            String bulkString = "";
            boolean mention;
            while (rs.next()) {
                String message = rs.getString(1).toLowerCase();
                mention = message.contains("@");
                if (lemmatization) {
                    // Replace every token with its lemma.
                    annotation = new Annotation(message);
                    message = "";
                    pipeline.annotate(annotation);
                    List<CoreMap> lem = annotation.get(SentencesAnnotation.class);
                    for (CoreMap l : lem) {
                        for (CoreLabel token : l.get(TokensAnnotation.class)) {
                            message += token.get(LemmaAnnotation.class) + " ";
                        }
                    }
                }
                if (ngram > 1) {
                    // Rewrite the message as space-separated shingles; see the note after this listing.
                    String processedMessage = "";
                    TokenStream tokenStream = shingleAnalyzer.tokenStream("text", new StringReader(message));
                    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
                    tokenStream.reset(); // good practice in 3.6, mandatory from Lucene 4 onward
                    while (tokenStream.incrementToken()) {
                        String termToken = charTermAttribute.toString();
                        if (!termToken.contains("_")) {
                            processedMessage += termToken.replace(" ", "=") + " ";
                        }
                    }
                    message = processedMessage;
                }
                // Buffer the row and flush it to MySQL in batches of _BULK_SIZE_ rows.
                bulk++;
                if (bulk < _BULK_SIZE_) {
                    bulkString += " (" + docId + ",'" + rs.getString(2) + "',\"" + message + "\",\""
                            + rs.getString(3) + "\"),";
                } else {
                    bulk = 0;
                    bulkString += " (" + docId + ",'" + rs.getString(2) + "',\"" + message + "\",\""
                            + rs.getString(3) + "\");";
                    statement2.executeUpdate("INSERT INTO " + appVariables.configuration.getSchema() + "."
                            + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                            + "gram (time_slice,msg_post_time,msg_text,msg_author) VALUES" + bulkString);
                    bulkString = "";
                }
                globalContent += message + "\n";
                if (mention) {
                    mentionContent += message + "\n";
                }
                timestamps += rs.getString(2) + "\n";
            }
            if (bulk > 0 && bulkString.length() > 0) {
                // Flush the last partial batch, replacing the trailing comma with a semicolon.
                statement2.executeUpdate("INSERT INTO " + appVariables.configuration.getSchema() + "."
                        + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                        + "gram (time_slice,msg_post_time,msg_text,msg_author) VALUES"
                        + bulkString.substring(0, bulkString.length() - 1) + ";");
            }
            // Index the slice: one document in the global index, one in the mention index.
            Document docGlobal = new Document();
            docGlobal.add(new Field("content", globalContent, Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            docGlobal.add(new Field("id", Integer.toString(docId), Field.Store.YES, Field.Index.NOT_ANALYZED));
            wGlobal.addDocument(docGlobal);
            wGlobal.commit();
            Document docMention = new Document();
            docMention.add(new Field("content", mentionContent, Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            docMention.add(new Field("id", Integer.toString(docId), Field.Store.YES, Field.Index.NOT_ANALYZED));
            wMention.addDocument(docMention);
            wMention.commit();
            // Also dump the slice's text and timestamps to flat files.
            File textFile = new File(path + "/input/" + formatter.format(docId) + ".text");
            FileUtils.writeStringToFile(textFile, globalContent);
            File timeFile = new File(path + "/input/" + formatter.format(docId) + ".time");
            FileUtils.writeStringToFile(timeFile, timestamps);
            docId++;
            statement.close();
        }
        statement2.executeUpdate("CREATE INDEX index_time_slice ON " + appVariables.configuration.getSchema()
                + "." + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram (time_slice);");
        statement2.executeUpdate("CREATE FULLTEXT INDEX index_text ON " + appVariables.configuration.getSchema()
                + "." + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram (msg_text);");
        statement2.close();
        connection.close();
        wGlobal.close();
        wMention.close();
    } catch (IOException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    } catch (SQLException | InstantiationException | IllegalAccessException | ClassNotFoundException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    } catch (NoSuchMethodException | SecurityException | IllegalArgumentException | InvocationTargetException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    }
}
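Two details of the shingle handling above deserve a note. First, ShingleFilter marks the position of any token removed by the wrapped analyzer (typically a stopword) with the filler token "_"; a shingle containing "_" therefore spans a gap rather than `ngram` consecutive surviving words, which is why such shingles are discarded. Second, each surviving shingle is a space-separated phrase, so its words are rejoined with "=" before indexing; the WhitespaceAnalyzer configured on the IndexWriter then treats every n-gram as a single, indivisible term.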