public class LdaGibbsSampling {

    public static class modelparameters {
        float alpha = 0.5f;   // usual value is 50 / K
        float beta = 0.1f;    // usual value is 0.1
        int topicNum = 100;
        int iteration = 100;
        int saveStep = 10;
        int beginSaveIters = 50;
    }

    /** Get parameters from the configuration file. If the file provides a
     * value, use it; otherwise the default above is kept.
     * @param ldaparameters
     * @param parameterFile
     */
    private static void getParametersFromFile(modelparameters ldaparameters,
            String parameterFile) {
        ArrayList<String> paramLines = FileUtil.readList(parameterFile);
        for (String line : paramLines) {
            String[] lineParts = line.split("\t");  // was "/t", which never matches a tab
            switch (parameters.valueOf(lineParts[0])) {
            case alpha:
                ldaparameters.alpha = Float.valueOf(lineParts[1]);
                break;
            case beta:
                ldaparameters.beta = Float.valueOf(lineParts[1]);
                break;
            case topicNum:
                ldaparameters.topicNum = Integer.valueOf(lineParts[1]);
                break;
            case iteration:
                ldaparameters.iteration = Integer.valueOf(lineParts[1]);
                break;
            case saveStep:
                ldaparameters.saveStep = Integer.valueOf(lineParts[1]);
                break;
            case beginSaveIters:
                ldaparameters.beginSaveIters = Integer.valueOf(lineParts[1]);
                break;
            }
        }
    }

    public enum parameters {
        alpha, beta, topicNum, iteration, saveStep, beginSaveIters;
    }

    /**
     * Train the LDA topic model, predict the topic of every sample in the
     * given test set, and collect the top 20 words of each sample's most
     * probable topic as the representative keyword set of the test set.
     * @param trainPathDir
     * @param parameterFile
     * @param resultPath
     * @param testPath
     * @throws IOException
     */
    public Set<Word> trainAndPredictLDA(String trainPathDir, String parameterFile,
            String resultPath, String testPath) throws IOException {
        modelparameters ldaparameters = new modelparameters();
        getParametersFromFile(ldaparameters, parameterFile);
        Documents docSet = new Documents();
        docSet.readDocs(trainPathDir);
        System.out.println("wordMap size " + docSet.termToIndexMap.size());
        FileUtil.mkdir(resultPath);
        LdaModel model = new LdaModel(ldaparameters);
        System.out.println("1 Initialize the model ...");
        model.initializeModel(docSet);
        System.out.println("2 Learning and Saving the model ...");
        model.inferenceModel(docSet);
        System.out.println("3 Output the final model ...");
        // model.saveIteratedModel(ldaparameters.iteration, docSet);
        // System.out.println("Done!");

        // predict new texts
        Documents testDocs = new Documents();
        List<Message> messages = FileUtil.readMessageFromFile(testPath);
        Set<Integer> topicIndexSet = new HashSet<Integer>();
        for (Message message : messages) {
            String content = message.getContent();
            Document doc = new Document(content);
            testDocs.docs.add(doc);
            topicIndexSet.add(model.predictNewSampleTopic(doc));
        }
        // Predict each SMS message to get its most probable topic; then take
        // the top 20 words of every such topic and weight the set by TF-IDF.
        Set<Word> wordSet = model.getWordByTopics(topicIndexSet, 20);
        LDAFeatureProcess.calTFIDFAsWeight(docSet, wordSet);
        return wordSet;
    }

    @Test
    public void test() throws IOException {
        String resultPath = "ldaResult/";
        String parameterFile = "source/lda_parameters.txt";
        String trainPathDir = "LDATrain/";
        String testPath = "train/train_messages.txt";
        Set<Word> wordSet = trainAndPredictLDA(trainPathDir, parameterFile, resultPath, testPath);
        FileUtil.writeKeyWordFile("ldaWords/keyWords.doc", new ArrayList<Word>(wordSet));
    }

    /**
     * @param args
     * @throws IOException
     */
    public static void main(String[] args) throws IOException {
        String resultPath = "ldaResult/";
        String parameterFile = "source/lda_parameters.txt";
        modelparameters ldaparameters = new modelparameters();
        getParametersFromFile(ldaparameters, parameterFile);
        String dirPath = "LDATrain/";
        Documents docSet = new Documents();
        docSet.readDocs(dirPath);
        System.out.println("wordMap size " + docSet.termToIndexMap.size());
        FileUtil.mkdir(resultPath);
        LdaModel model = new LdaModel(ldaparameters);
        System.out.println("1 Initialize the model ...");
        model.initializeModel(docSet);
        System.out.println("2 Learning and Saving the model ...");
        model.inferenceModel(docSet);
        System.out.println("3 Output the final model ...");
        model.saveIteratedModel(ldaparameters.iteration, docSet);
        System.out.println("Done!");

        // predict a new text (a sample Chinese SMS advertisement)
        String messStr = "好消息!!薇町婚纱造型推出老带新活动啦!已在本店预定的新娘推荐新顾客来本店,定单后即赠送新、老顾客各一支价值58元定妆隔离水(在婚礼当";
        Document doc = new Document(messStr);
        int topicIndex = model.predictNewSampleTopic(doc);
        Set<Word> wordSet = model.getWordByTopic(topicIndex);
        FileUtil.writeKeyWordFile("ldaWords/comparedkeyWords.doc", new ArrayList<Word>(wordSet));
    }
}
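getParametersFromFile expects one key/value pair per line, separated by a tab, with keys spelled exactly like the constants of the parameters enum. The file actually used by the post is not shown; a minimal source/lda_parameters.txt consistent with that format might look as follows (the values simply repeat the in-code defaults):

alpha	0.5
beta	0.1
topicNum	100
iteration	100
saveStep	10
beginSaveIters	50

Any parameter missing from the file keeps its default from modelparameters.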
public class LdaModel {

    int[][] doc;        // word index array
    int V, K, M;        // vocabulary size, topic number, document number
    int[][] z;          // topic label array
    float alpha;        // doc-topic Dirichlet prior parameter
    float beta;         // topic-word Dirichlet prior parameter
    int[][] nmk;        // given document m, count of topic k. M*K
    int[][] nkt;        // given topic k, count of term t. K*V
    int[] nmkSum;       // sum of each row in nmk
    int[] nktSum;       // sum of each row in nkt
    double[][] phi;     // parameters of the topic-word distribution, K*V
    double[][] theta;   // parameters of the doc-topic distribution, M*K
    int iterations;     // number of iterations
    int saveStep;       // number of iterations between two saves
    int beginSaveIters; // begin saving the model at this iteration
    Map<String, Integer> wordIndexMap;
    Documents docSet;

    public LdaModel(LdaGibbsSampling.modelparameters modelparam) {
        alpha = modelparam.alpha;
        beta = modelparam.beta;
        iterations = modelparam.iteration;
        K = modelparam.topicNum;
        saveStep = modelparam.saveStep;
        beginSaveIters = modelparam.beginSaveIters;
    }

    public void initializeModel(Documents docSet) {
        this.docSet = docSet;
        M = docSet.docs.size();
        V = docSet.termToIndexMap.size();
        nmk = new int[M][K];
        nkt = new int[K][V];
        nmkSum = new int[M];
        nktSum = new int[K];
        phi = new double[K][V];
        theta = new double[M][K];
        this.wordIndexMap = new HashMap<String, Integer>();

        // initialize the document index array
        doc = new int[M][];
        for (int m = 0; m < M; m++) {
            // notice the memory limit
            int N = docSet.docs.get(m).docWords.length;
            doc[m] = new int[N];
            for (int n = 0; n < N; n++) {
                doc[m][n] = docSet.docs.get(m).docWords[n];
            }
        }

        // initialize the topic label z of each word
        z = new int[M][];
        for (int m = 0; m < M; m++) {
            int N = docSet.docs.get(m).docWords.length;
            z[m] = new int[N];
            for (int n = 0; n < N; n++) {
                // random initialization
                int initTopic = (int) (Math.random() * K); // from 0 to K - 1
                z[m][n] = initTopic;
                // number of words in doc m assigned to topic initTopic plus 1
                nmk[m][initTopic]++;
                // number of terms doc[m][n] assigned to topic initTopic plus 1
                nkt[initTopic][doc[m][n]]++;
                // total number of words assigned to topic initTopic plus 1
                nktSum[initTopic]++;
            }
            // total number of words in document m is N
            nmkSum[m] = N;
        }
    }

    public void inferenceModel(Documents docSet) throws IOException {
        if (iterations < saveStep + beginSaveIters) {
            System.err.println("Error: the number of iterations should be at least "
                    + (saveStep + beginSaveIters));
            System.exit(0);
        }
        for (int i = 0; i < iterations; i++) {
            System.out.println("Iteration " + i);
            if ((i >= beginSaveIters) && (((i - beginSaveIters) % saveStep) == 0)) {
                System.out.println("Saving model at iteration " + i + " ...");
                // first update the estimated parameters
                updateEstimatedParameters();
                // then write the model variables to disk
                saveIteratedModel(i, docSet);
            }

            // use Gibbs sampling to update z[][]
            for (int m = 0; m < M; m++) {
                int N = docSet.docs.get(m).docWords.length;
                for (int n = 0; n < N; n++) {
                    // sample from p(z_i|z_-i, w)
                    int newTopic = sampleTopicZ(m, n);
                    z[m][n] = newTopic;
                }
            }
        }
    }

    private void updateEstimatedParameters() {
        for (int k = 0; k < K; k++) {
            for (int t = 0; t < V; t++) {
                phi[k][t] = (nkt[k][t] + beta) / (nktSum[k] + V * beta);
            }
        }
        for (int m = 0; m < M; m++) {
            for (int k = 0; k < K; k++) {
                theta[m][k] = (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha);
            }
        }
    }

    private int sampleTopicZ(int m, int n) {
        // sample from p(z_i|z_-i, w) using the Gibbs update rule

        // remove the current topic label of w_{m,n}
        int oldTopic = z[m][n];
        nmk[m][oldTopic]--;
        nkt[oldTopic][doc[m][n]]--;
        nmkSum[m]--;
        nktSum[oldTopic]--;

        // compute p(z_i = k|z_-i, w)
        double[] p = new double[K];
        for (int k = 0; k < K; k++) {
            p[k] = (nkt[k][doc[m][n]] + beta) / (nktSum[k] + V * beta)
                 * (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha);
        }

        // sample a new topic label for w_{m,n} roulette-style:
        // accumulate p into a cumulative distribution
        for (int k = 1; k < K; k++) {
            p[k] += p[k - 1];
        }
        double u = Math.random() * p[K - 1]; // p[] is unnormalised
        int newTopic;
        for (newTopic = 0; newTopic < K; newTopic++) {
            if (u < p[newTopic]) {
                break;
            }
        }

        // add the new topic label of w_{m,n}
        nmk[m][newTopic]++;
        nkt[newTopic][doc[m][n]]++;
        nmkSum[m]++;
        nktSum[newTopic]++;
        return newTopic;
    }

    /**
     * Map each word of the segmented text to be predicted onto the index of
     * the same word in the training set.
     * @param predictWordSet
     */
    public Map<String, String> matchTermIndex(Set<Word> predictWordSet) {
        // key: word content; value: "docIndex-wordIndex", e.g. "1-2"
        Map<String, String> wordIndexMap = new HashMap<String, String>();
        for (Word word : predictWordSet) {
            String content = word.getContent();
            String indexStr = getTermIndex(content);
            wordIndexMap.put(content, indexStr);
        }
        return wordIndexMap;
    }

    /**
     * For a given word, find its document index and word index in the
     * training set.
     * @param content
     */
    public String getTermIndex(String content) {
        for (Integer m : docSet.getDocWordsList().keySet()) {
            LinkedList<String> list = docSet.getDocWordsList().get(m);
            for (int i = 0; i < list.size(); i++) {
                if (list.get(i).equals(content))
                    return m + "-" + i;
            }
        }
        return "none";
    }

    /**
     * After the LDA model has been trained, collect the topNum words of each
     * topic in the given topic index set.
     * @param topicIndexSet
     * @param topNum
     */
    public Set<Word> getWordByTopics(Set<Integer> topicIndexSet, int topNum) {
        Set<Word> wordSet = new HashSet<Word>();
        for (Integer indexT : topicIndexSet) {
            List<Integer> tWordsIndexArray = new ArrayList<Integer>();
            for (int j = 0; j < V; j++)
                tWordsIndexArray.add(Integer.valueOf(j));
            Collections.sort(tWordsIndexArray, new LdaModel.TwordsComparable(phi[indexT]));
            for (int t = 0; t < topNum; t++) {
                String content = docSet.indexToTermMap.get(tWordsIndexArray.get(t));
                Word word = new Word(content);
                if (SegmentWordsResult.getStopWordsSet().contains(content)
                        || ProcessKeyWords.remove(word)
                        || ProcessKeyWords.isMeaninglessWord(content))
                    continue;
                wordSet.add(word);
            }
        }
        return wordSet;
    }

    public Set<Word> getWordByTopic(Integer topicIndex) {
        Set<Word> wordSet = new HashSet<Word>();
        List<Integer> tWordsIndexArray = new ArrayList<Integer>();
        for (int j = 0; j < V; j++) {
            tWordsIndexArray.add(Integer.valueOf(j));
        }
        Collections.sort(tWordsIndexArray, new LdaModel.TwordsComparable(phi[topicIndex]));
        for (int t = 0; t < V; t++) {
            String content = docSet.indexToTermMap.get(tWordsIndexArray.get(t));
            Word word = new Word(content);
            word.setWeight(phi[topicIndex][tWordsIndexArray.get(t)]);
            if (SegmentWordsResult.getStopWordsSet().contains(content)
                    || ProcessKeyWords.remove(word)
                    || ProcessKeyWords.isMeaninglessWord(content))
                continue;
            if (phi[topicIndex][tWordsIndexArray.get(t)] <= 0.0)
                continue;
            wordSet.add(word);
        }
        return wordSet;
    }

    public int predictNewSampleTopic(Document doc) {
        double[] topicProb = new double[K];
        Map<String, String> wordIndexMap = matchTermIndex(doc.getWordMap().keySet());
        int predict_v = doc.getWordCount();
        int[][] predict_nkt;    // given topic k, count of term t. K*predict_v
        double[][] predict_phi; // parameters of the topic-word distribution
        int[] predict_z;        // topic label array
        int[] predict_nk;       // for each topic, how many words of this document are assigned to it
        predict_nkt = new int[K][predict_v];
        predict_phi = new double[K][predict_v];
        predict_z = new int[predict_v];
        predict_nk = new int[K];
        for (int index = 0; index < predict_v; index++) {
            String content = doc.getWordsList().get(index);
            String indexStr = wordIndexMap.get(content);
            if (indexStr.indexOf("-") == -1) // word unseen in training: skip
                continue;
            int m = Integer.valueOf(indexStr.substring(0, indexStr.indexOf("-")));
            int n = Integer.valueOf(indexStr.substring(indexStr.indexOf("-") + 1));
            // sample from p(z_i|z_-i, w)
            int newTopic = predictSampleTopicZ(m, n);
            predict_z[index] = newTopic;
            predict_nkt[newTopic][index]++;
            predict_nk[newTopic]++;
        }
        for (int k = 0; k < K; k++) {
            topicProb[k] = (predict_nk[k] + alpha) / (predict_v + K * alpha);
        }
        return getTopic(topicProb);
    }

    public int getTopic(double[] topicProp) {
        int maxIndex = 0;
        double maxProp = topicProp[0];
        for (int k = 1; k < K; k++) {
            if (maxProp < topicProp[k]) {
                maxProp = topicProp[k];
                maxIndex = k;
            }
        }
        return maxIndex;
    }

    public int predictSampleTopicZ(int m, int n) {
        // sample from p(z_i = k|z_-i, w) using the counts of the trained model
        double[] p = new double[K];
        for (int k = 0; k < K; k++) {
            p[k] = (nkt[k][doc[m][n]] + beta) / (nktSum[k] + V * beta)
                 * (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha);
        }
        // roulette-style sampling over the cumulative probabilities
        for (int k = 1; k < K; k++) {
            p[k] += p[k - 1];
        }
        double u = Math.random() * p[K - 1]; // p[] is unnormalised
        int newTopic;
        for (newTopic = 0; newTopic < K; newTopic++) {
            if (u < p[newTopic]) {
                break;
            }
        }
        return newTopic;
    }

    public void saveIteratedModel(int iters, Documents docSet) throws IOException {
        // lda.params lda.phi lda.theta lda.tassign lda.twords
        // lda.params
        String resultPath = "ldaResult/";
        String modelName = "lda_" + iters;
        ArrayList<String> lines = new ArrayList<String>();
        lines.add("alpha = " + alpha);
        lines.add("beta = " + beta);
        lines.add("topicNum = " + K);
        lines.add("docNum = " + M);
        lines.add("termNum = " + V);
        lines.add("iterations = " + iterations);
        lines.add("saveStep = " + saveStep);
        lines.add("beginSaveIters = " + beginSaveIters);
        FileUtil.writeLines(resultPath + modelName + ".params", lines);

        // lda.phi K*V
        BufferedWriter writer = new BufferedWriter(new FileWriter(resultPath + modelName + ".phi"));
        for (int i = 0; i < K; i++) {
            for (int j = 0; j < V; j++) {
                writer.write(phi[i][j] + "\t"); // was "/t"; same fix for all "/t" and "/n" below
            }
            writer.write("\n");
        }
        writer.close();

        // lda.theta M*K
        writer = new BufferedWriter(new FileWriter(resultPath + modelName + ".theta"));
        for (int i = 0; i < M; i++) {
            for (int j = 0; j < K; j++) {
                writer.write(theta[i][j] + "\t");
            }
            writer.write("\n");
        }
        writer.close();

        // lda.tassign
        writer = new BufferedWriter(new FileWriter(resultPath + modelName + ".tassign"));
        for (int m = 0; m < M; m++) {
            for (int n = 0; n < doc[m].length; n++) {
                writer.write(doc[m][n] + ":" + z[m][n] + "\t");
            }
            writer.write("\n");
        }
        writer.close();

        // lda.twords phi[][] K*V
        List<Word> appendwords = new ArrayList<Word>();
        writer = new BufferedWriter(new FileWriter(resultPath + modelName + ".twords"));
        int topNum = 10;
        // find the topNum topic words of each topic
        for (int i = 0; i < K; i++) {
            List<Integer> tWordsIndexArray = new ArrayList<Integer>();
            for (int j = 0; j < V; j++) {
                tWordsIndexArray.add(Integer.valueOf(j));
            }
            Collections.sort(tWordsIndexArray, new LdaModel.TwordsComparable(phi[i]));
            writer.write("topic " + i + "\t:\t");
            for (int t = 0; t < topNum; t++) {
                writer.write(docSet.indexToTermMap.get(tWordsIndexArray.get(t)) + " "
                        + phi[i][tWordsIndexArray.get(t)] + "\t");
                Word word = new Word(docSet.indexToTermMap.get(tWordsIndexArray.get(t)));
                word.setWeight(phi[i][tWordsIndexArray.get(t)]);
                appendwords.add(word);
            }
            writer.write("\n");
        }
        writer.close();

        // lda.words
        writer = new BufferedWriter(new FileWriter(resultPath + modelName + ".words"));
        for (Word word : appendwords) {
            if (word.getContent().trim().equals(""))
                continue;
            writer.write(word.getContent() + "\t" + word.getWeight() + "\n");
        }
        writer.close();
    }

    public class TwordsComparable implements Comparator<Integer> {

        public double[] sortProb; // probability of each word in topic k

        public TwordsComparable(double[] sortProb) {
            this.sortProb = sortProb;
        }

        @Override
        public int compare(Integer o1, Integer o2) {
            // sort word indices by descending probability in topic k
            if (sortProb[o1] > sortProb[o2])
                return -1;
            else if (sortProb[o1] < sortProb[o2])
                return 1;
            else
                return 0;
        }
    }
}
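For reference, sampleTopicZ implements the standard collapsed Gibbs update for LDA (the Griffiths and Steyvers formulation). After the current assignment of word $w_{m,n}$ is removed from the counts, a new topic is drawn from the full conditional

$$p(z_{m,n} = k \mid z_{\neg(m,n)}, w) \;\propto\; \frac{n_{kt} + \beta}{n_{k} + V\beta} \cdot \frac{n_{mk} + \alpha}{n_{m} + K\alpha}$$

where $n_{kt}$ is the number of times term $t$ is assigned to topic $k$, $n_k = \sum_t n_{kt}$, $n_{mk}$ is the number of words in document $m$ assigned to topic $k$, and $n_m$ is the length of document $m$. updateEstimatedParameters then reads off the smoothed point estimates written to the save files:

$$\varphi_{kt} = \frac{n_{kt} + \beta}{n_{k} + V\beta}, \qquad \theta_{mk} = \frac{n_{mk} + \alpha}{n_{m} + K\alpha}$$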
public class Documents {

    ArrayList<Document> docs;
    Map<String, Integer> termToIndexMap;
    ArrayList<String> indexToTermMap;
    Map<String, Integer> termCountMap;
    private static NLPIRUtil npr = new NLPIRUtil();
    private static Set<String> stopWordsSet = SegmentWordsResult.getStopWordsSet();
    private Map<Word, Integer> wordDocMap;
    // key: document index i; value: its word list, kept so that the indices
    // match doc[m][n] in the LDA model
    private Map<Integer, LinkedList<String>> docWordsList;

    public Documents() {
        docs = new ArrayList<Document>();
        termToIndexMap = new HashMap<String, Integer>();
        indexToTermMap = new ArrayList<String>();
        termCountMap = new HashMap<String, Integer>();
        this.wordDocMap = new HashMap<Word, Integer>();
        this.docWordsList = new HashMap<Integer, LinkedList<String>>();
    }

    public Map<String, Integer> getTermCountMap() {
        return termCountMap;
    }

    public void setTermCountMap(Map<String, Integer> termCountMap) {
        this.termCountMap = termCountMap;
    }

    public Map<Word, Integer> getWordDocMap() {
        return wordDocMap;
    }

    public void setWordDocMap(Map<Word, Integer> wordDocMap) {
        this.wordDocMap = wordDocMap;
    }

    public Map<Integer, LinkedList<String>> getDocWordsList() {
        return docWordsList;
    }

    public void setDocWordsList(Map<Integer, LinkedList<String>> docWordsList) {
        this.docWordsList = docWordsList;
    }

    public void readDocs(String docsPath) {
        int index = 0;
        for (File docFile : new File(docsPath).listFiles()) {
            Document doc = new Document(docFile.getAbsolutePath(),
                    termToIndexMap, indexToTermMap, termCountMap);
            docs.add(doc);
            // count the number of documents containing each word
            for (Word word : doc.getWordMap().keySet()) {
                if (this.wordDocMap.containsKey(word))
                    this.wordDocMap.put(word, this.wordDocMap.get(word) + 1); // original was missing "+ 1"
                else
                    this.wordDocMap.put(word, 1);
            }
            this.docWordsList.put(index++, doc.getWordsList());
        }
    }
}
public class Document {

    private static NLPIRUtil npr = new NLPIRUtil();
    private static Set<String> stopWordsSet = SegmentWordsResult.getStopWordsSet();
    private String docName;
    int[] docWords;
    private int wordCount;
    private Map<Word, Integer> wordMap;
    // keeps the word contents aligned with the indices stored in docWords
    private LinkedList<String> wordsList;

    public int getWordCount() {
        return wordCount;
    }

    public void setWordCount(int wordCount) {
        this.wordCount = wordCount;
    }

    public Map<Word, Integer> getWordMap() {
        return wordMap;
    }

    public void setWordMap(Map<Word, Integer> wordMap) {
        this.wordMap = wordMap;
    }

    public LinkedList<String> getWordsList() {
        return wordsList;
    }

    public void setWordsList(LinkedList<String> wordsList) {
        this.wordsList = wordsList;
    }

    public Document(String docContent) {
        this.wordMap = new HashMap<Word, Integer>();
        this.wordsList = new LinkedList<String>();
        String splitResult = npr.NLPIR_ParagraphProcess(
                ProcessMessage.dealWithSentence(docContent), 0);
        String[] wordsArray = splitResult.split(" ");
        this.docWords = new int[wordsArray.length];
        // local document-level term -> index map; the original keyed these
        // lookups on wordMap with a String, which never matches a Word key
        Map<String, Integer> termIndexMap = new HashMap<String, Integer>();
        int index = 0;
        // transfer words to indices
        for (String str : wordsArray) {
            String content = ProcessMessage.dealSpecialString(str);
            Word word = new Word(content);
            if (ProcessKeyWords.remove(word) || stopWordsSet.contains(content))
                continue;
            else if (content.length() <= 1 || RegexMatch.specialMatch(content))
                continue;
            this.wordCount++;
            if (!termIndexMap.containsKey(content)) {
                int newIndex = termIndexMap.size();
                termIndexMap.put(content, newIndex);
                wordMap.put(word, 1);
                docWords[index++] = newIndex;
            } else {
                wordMap.put(word, wordMap.get(word) + 1);
                // store the word's index, not its count as the original did
                docWords[index++] = termIndexMap.get(content);
            }
            this.wordsList.add(content);
        }
        // trim the slots left unused by filtered words (java.util.Arrays)
        this.docWords = Arrays.copyOf(docWords, index);
    }

    public Document(String filePath, Map<String, Integer> termToIndexMap,
            ArrayList<String> indexToTermMap, Map<String, Integer> termCountMap) {
        // note: the original chained this(FileUtil.readContent(filePath)) here,
        // which segmented the file twice and double-counted wordCount
        this.docName = filePath;
        this.wordMap = new HashMap<Word, Integer>();
        this.wordsList = new LinkedList<String>();
        // read the file and initialize the word index array
        String docContent = FileUtil.readContent(docName);
        String splitResult = npr.NLPIR_ParagraphProcess(docContent, 0);
        String[] wordsArray = splitResult.split(" ");
        this.docWords = new int[wordsArray.length];
        int index = 0;
        // transfer words to indices
        for (String str : wordsArray) {
            String content = ProcessMessage.dealSpecialString(str);
            Word word = new Word(content);
            if (ProcessKeyWords.remove(word) || stopWordsSet.contains(content))
                continue;
            else if (ProcessKeyWords.isMeaninglessWord(content))
                continue;
            this.wordCount++;
            if (!termToIndexMap.containsKey(content)) {
                int newIndex = termToIndexMap.size();
                // the original mixed str and content here; use the cleaned
                // content consistently for all three maps
                termToIndexMap.put(content, newIndex);
                indexToTermMap.add(content);
                termCountMap.put(content, Integer.valueOf(1));
                docWords[index++] = newIndex;
            } else {
                termCountMap.put(content, termCountMap.get(content) + 1);
                docWords[index++] = termToIndexMap.get(content);
            }
            this.wordsList.add(content);
            if (wordMap.containsKey(word))
                wordMap.put(word, wordMap.get(word) + 1);
            else
                wordMap.put(word, 1);
        }
        this.docWords = Arrays.copyOf(docWords, index); // trim unused slots
    }

    public boolean isNoiseWord(String string) {
        string = string.toLowerCase().trim();
        // filter @xxx and URLs; the original escaped the dots as "//.",
        // which is not a valid regex escape
        if (string.matches(".*www\\..*") || string.matches(".*\\.com.*")
                || string.matches(".*http:.*"))
            return true;
        else
            return false;
    }
}
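Note that Documents and Document both count occurrences in a Map<Word, Integer>, which only works if Word compares by its content. The Word class is not shown in the post; a minimal sketch of what it presumably looks like (the field and method names beyond getContent, getWeight and setWeight, which the code above calls, are assumptions):

// Minimal sketch of the Word class assumed by the code above. equals and
// hashCode must be based on the word content, otherwise the Map<Word, Integer>
// counts would never find an existing key.
public class Word {
    private String content;
    private double weight;

    public Word(String content) { this.content = content; }

    public String getContent() { return content; }
    public double getWeight() { return weight; }
    public void setWeight(double weight) { this.weight = weight; }

    @Override
    public boolean equals(Object o) {
        return o instanceof Word && content.equals(((Word) o).content);
    }

    @Override
    public int hashCode() { return content.hashCode(); }
}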
In the code above, LdaModel provides predictNewSampleTopic, the method that predicts a new sample and returns the index of its most probable topic, while LdaGibbsSampling implements the training workflow of the LDA topic model. Part of the resulting topic-word distribution looks as follows:
topic 0 :
⒐ 0.0029859442729502916
住宅 0.002257665153592825
制造 0.002257665153592825
行为 0.002257665153592825
收益 0.0015293860342353582
西北 0.0015293860342353582
红星 0.0015293860342353582
轻松 0.0015293860342353582
小商品 0.0015293860342353582
搜房网 0.0015293860342353582
topic 1 :
贵宾 0.0030435749795287848
商城 0.0023012396413832903
太平洋保险 0.0015589043032377958
建设 0.0015589043032377958
储蓄 0.0015589043032377958
周四 0.0015589043032377958
完成 0.0015589043032377958
区内 0.0015589043032377958
王志钢 0.0015589043032377958
872944 0.0015589043032377958
topic 2 :
油田 0.0017282527405768633
雀巢 0.0017282527405768633
金千 0.0017282527405768633
山腰 9.052753448486328E-4
代办 9.052753448486328E-4
洋房 9.052753448486328E-4
月饼 9.052753448486328E-4
三星 9.052753448486328E-4
集成 9.052753448486328E-4
大桥 9.052753448486328E-4
topic 3 :
美容 0.0016053818399086595
疯狂 0.0016053818399086595
获取 0.0016053818399086595
名牌 0.0016053818399086595
风神 0.0016053818399086595
小额 0.0016053818399086595
璀璨 0.0016053818399086595
一千 0.0016053818399086595
专注 0.0016053818399086595
发放 0.0016053818399086595
topic 4 :
焦点 0.002957939635962248
搜狐 0.002236490836367011
房屋 0.002236490836367011
玉兰 0.002236490836367011
短期 0.002236490836367011
理疗 0.002236490836367011
4001080000 0.0015150421531870961
命题 0.0015150421531870961
公开 0.0015150421531870961
乐器 0.0015150421531870961
topic 5 :
实验 0.0023698494769632816
每块 0.0023698494769632816
收费 0.0023698494769632816
博览 0.0016053818399086595
重新 0.0016053818399086595
任意 0.0016053818399086595
借款 0.0016053818399086595
保底 0.0016053818399086595
预期 0.0016053818399086595
初二 0.0016053818399086595
topic 6 :
宗旨 0.0016625761054456234
陈勇军 0.0016625761054456234
拨打 0.0016625761054456234
家人 0.0016625761054456234
工业 0.0016625761054456234
百货店 0.0016625761054456234
实业 0.0016625761054456234
6222024000068818521 0.0016625761054456234
18692297994 0.0016625761054456234
13300 0.0016625761054456234
topic 7 :
→ 0.005167018622159958
餐厅 0.00298377126455307
保修 0.00298377126455307
英语 0.0022560220677405596
红 0.0022560220677405596
普通 0.0022560220677405596
学习 0.001528272987343371
龙湖 0.001528272987343371
电大 0.001528272987343371
任意 0.001528272987343371
topic 8 :
登陆 0.0025078877806663513
食宿 0.001698891632258892
急需 0.001698891632258892
建行 0.001698891632258892
葡萄酒 0.001698891632258892
新版 0.001698891632258892
富豪 0.001698891632258892
对比 0.001698891632258892
泥工 0.001698891632258892
相信 8.898956584744155E-4
topic 9 :
体育 0.7940398454666138
活动 0.005577780772000551
优惠 0.0038460372015833855
欢迎 0.003806901630014181
银行 0.0032981408294290304
电话 0.003268789267167449
联系 0.0031611667945981026
公司 0.002769812010228634
地址 0.0024860799312591553
】 0.002339322119951248
topic 10 :
年级 0.0023899467196315527
车主 0.0023899467196315527
过程 0.0016189961461350322
华联 0.0016189961461350322
家电 0.0016189961461350322
大业 0.0016189961461350322
时代 0.0016189961461350322
迪赛尼斯 0.0016189961461350322
稀缺 0.0016189961461350322
稳定 0.0016189961461350322
topic 11 :
利率 0.002570267766714096
知名 0.002570267766714096
南湖 0.0017411491135135293
实现 0.0017411491135135293
立秋 0.0017411491135135293
就读 0.0017411491135135293
罗马 0.0017411491135135293
广电局 0.0017411491135135293
独具 0.0017411491135135293
静候 0.0017411491135135293
topic 12 :
哥哥 0.0029536776710301638
家里 0.0029536776710301638
化妆 0.0029536776710301638
名品 0.0022332684602588415
一 0.0022332684602588415
四川 0.0015128592494875193
二手车 0.0015128592494875193
订购 0.0015128592494875193
多种 0.0015128592494875193
潜力 0.0015128592494875193
topic 13 :
建行 0.002435001078993082
开发商 0.0016495168674737215
美容 0.0016495168674737215
奔驰 0.0016495168674737215
比例 0.0016495168674737215
英伦 0.0016495168674737215
开通 0.0016495168674737215
开班 0.0016495168674737215
打开 0.0016495168674737215
英国 0.0016495168674737215
topic 14 :
增值 0.002355444012209773
[验] 0.002355444012209773
公开 0.0015956234419718385
打印机 0.0015956234419718385
家中 0.0015956234419718385
宾馆 0.0015956234419718385
12000 0.0015956234419718385
渠道 0.0015956234419718385
租赁 0.0015956234419718385
无效 0.0015956234419718385
topic 15 :
自由 0.0024857670068740845
巴拉巴 0.0024857670068740845
丰 0.0024857670068740845
朝阳 0.001683906652033329
家人 0.001683906652033329
84725588 0.001683906652033329
老弟 0.001683906652033329
商住 0.001683906652033329
县委 0.001683906652033329
德国 8.820463554002345E-4
topic 16 :
¥10亿 0.002975110663101077
楼下 0.002249473938718438
感恩 0.002249473938718438
独栋 0.002249473938718438
前来 0.0015238370979204774
手机 0.0015238370979204774
申请 0.0015238370979204774
乐 0.0015238370979204774
考点 0.0015238370979204774
3008300 0.0015238370979204774
topic 17 :
批发 0.00239548715762794
总监 0.0016227493761107326
车子 0.0016227493761107326
饭店 0.0016227493761107326
伙伴 0.0016227493761107326
直属 0.0016227493761107326
事后 0.0016227493761107326
翰林 0.0016227493761107326
专题片 0.0016227493761107326
装修 8.500116528011858E-4
topic 18 :
期待 0.0024758405052125454
价 0.0016771822702139616
你好 0.0016771822702139616
决定 0.0016771822702139616
助剂 0.0016771822702139616
人员 0.0016771822702139616
雄伟 0.0016771822702139616
只用 0.0016771822702139616
享受 8.785240934230387E-4
四川 8.785240934230387E-4
topic 19 :
房价 0.003103474387899041
底价 0.0023465293925255537
湖南 0.0015895843971520662
凡 0.0015895843971520662
送礼 0.0015895843971520662
恒大 0.0015895843971520662
一生 0.0015895843971520662
代言人 0.0015895843971520662
专车 0.0015895843971520662
大唐 0.0015895843971520662
topic 20 :
企业主 0.0023483068216592073
讲师 0.0023483068216592073
6222021001055293358 0.0023483068216592073
首发 0.0015907884808257222
认购 0.0015907884808257222
请问 0.0015907884808257222
发布 0.0015907884808257222
中午 0.0015907884808257222
开幕 0.0015907884808257222
⒍ 0.0015907884808257222
topic 21 :
重新 0.002323663793504238
帮忙 0.002323663793504238
85654475 0.002323663793504238
宾 0.002323663793504238
中国 0.0015740948729217052
学历 0.0015740948729217052
" 0.0015740948729217052
温州 0.0015740948729217052
好久 0.0015740948729217052
钢板 0.0015740948729217052
topic 22 :
可口 0.0024103878531605005
形象 0.0024103878531605005
减轻 0.0024103878531605005
高层 0.0016328433994203806
爸爸 0.0016328433994203806
基金 0.0016328433994203806
营业额 0.0016328433994203806
意大利 0.0016328433994203806
正常 0.0016328433994203806
吉智 0.0016328433994203806
topic 23 :
关系 0.0024738647043704987
经营 0.0016758438432589173
美容 0.0016758438432589173
梦想 0.0016758438432589173
喷漆 0.0016758438432589173
肌肤 0.0016758438432589173
刘汉琳 0.0016758438432589173
索菲 0.0016758438432589173
依依 0.0016758438432589173
欢迎 8.778230403549969E-4
topic 24 :
考试 0.0016652129124850035
上班 0.0016652129124850035
金条 0.0016652129124850035
宝 0.0016652129124850035
澳门 0.0016652129124850035
粘贴 0.0016652129124850035
收缩 0.0016652129124850035
18800574923 0.0016652129124850035
豪华 8.722544298507273E-4
老师 8.722544298507273E-4
topic 25 :
长期 0.0030594731215387583
开发区 0.0023132602218538523
低价 0.0023132602218538523
⑥ 0.0023132602218538523
转告 0.0023132602218538523
新 0.0015670472057536244
得到 0.0015670472057536244
[通] 0.0015670472057536244
融资 0.0015670472057536244
万科 0.0015670472057536244
topic 26 :
开发区 0.002339445985853672
石油 0.0015847859904170036
宁波 0.0015847859904170036
更换 0.0015847859904170036
不用 0.0015847859904170036
会议 0.0015847859904170036
初三 0.0015847859904170036
汽车站 0.0015847859904170036
抽空 0.0015847859904170036
实用 0.0015847859904170036
topic 27 :
代办 0.0016745076281949878
代表 0.0016745076281949878
女性 0.0016745076281949878
13825139678 0.0016745076281949878
承担 0.0016745076281949878
影响力 0.0016745076281949878
13934141989 0.0016745076281949878
槐花 0.0016745076281949878
沐 0.0016745076281949878
过敏 0.0016745076281949878
topic 28 :
婚礼 0.00862991251051426
海尔 0.002210969338193536
电影 0.002210969338193536
小乔 0.002210969338193536
15953174009 0.002210969338193536
茶店 0.002210969338193536
7627292. 0.002210969338193536
15985917304 0.002210969338193536
新余 0.001497753313742578
资料 0.001497753313742578
topic 29 :
【 0.021667908877134323
你 0.015670640394091606
您好 0.01555958017706871
光临 0.014560035429894924
尊敬 0.014337914064526558
现在 0.013005186803638935
】 0.012338823638856411
享受 0.010783976875245571
信用 0.009451250545680523
详情 0.007896402850747108
topic 30 :
西吉 0.0024778195656836033
封顶 0.0016785229090601206
押金 0.0016785229090601206
海外 0.0016785229090601206
澜庭 0.0016785229090601206
账户 0.0016785229090601206
原因 0.0016785229090601206
6222021001036927348 0.0016785229090601206
欧莱雅 0.0016785229090601206
推荐 8.792263106442988E-4
Original article by ItWorker. Please credit the source when reposting: https://blog.ytso.com/9510.html