module TopicModels

import Base: length

# A ragged matrix: a vector of variable-length vectors.
const RaggedMatrix{T} = Vector{Vector{T}}
mutable struct Corpus
    documents::RaggedMatrix{Int64}
    weights::RaggedMatrix{Float64}

    Corpus(documents::RaggedMatrix{Int64}, weights::RaggedMatrix{Float64}) =
        new(documents, weights)

    # Without explicit weights, every token counts with unit weight.
    Corpus(documents::RaggedMatrix{Int64}) =
        new(documents, [ones(Float64, length(doc)) for doc in documents])
end
mutable struct Model
    alphaPrior::Vector{Float64}       # Dirichlet prior over topics (length K)
    betaPrior::Float64                # symmetric Dirichlet prior over words
    topics::Matrix{Float64}           # K x V topic-word counts
    topicSums::Vector{Float64}        # total count per topic
    documentSums::Matrix{Float64}     # K x D document-topic counts
    assignments::RaggedMatrix{Int64}  # per-token topic assignments
    frozen::Bool                      # frozen models keep topic counts fixed
    corpus::Corpus

    function Model(alphaPrior::Vector{Float64},
                   betaPrior::Float64,
                   V::Int64,
                   corpus::Corpus)
        K = length(alphaPrior)
        D = length(corpus.documents)
        m = new(alphaPrior,
                betaPrior,
                zeros(Float64, K, V),    # topics
                zeros(Float64, K),       # topicSums
                zeros(Float64, K, D),    # documentSums
                [Int64[] for _ in 1:D],  # assignments
                false,
                corpus)
        initializeAssignments(m)
        return m
    end

    # Wrap a trained model around a new corpus.  The topic-word counts are
    # shared and frozen; fresh document-topic counts are allocated, sized for
    # the new corpus, so inference here cannot corrupt the trained statistics.
    function Model(trainedModel::Model, corpus::Corpus)
        D = length(corpus.documents)
        m = new(trainedModel.alphaPrior,
                trainedModel.betaPrior,
                trainedModel.topics,
                trainedModel.topicSums,
                zeros(Float64, length(trainedModel.alphaPrior), D),
                [Int64[] for _ in 1:D],
                true,
                corpus)
        initializeAssignments(m)
        return m
    end
end
length(corpus::Corpus) = length(corpus.documents)
# Give every token an initial topic drawn from the prior and record the counts.
function initializeAssignments(model::Model)
    for dd in 1:length(model.corpus)
        words = model.corpus.documents[dd]
        model.assignments[dd] = zeros(Int64, length(words))
        for ww in 1:length(words)
            word = words[ww]
            topic = sampleMultinomial(model.alphaPrior)
            model.assignments[dd][ww] = topic
            updateSufficientStatistics(
                word, topic, dd, model.corpus.weights[dd][ww], model)
        end
    end
    return
end
# Draw an index from the (unnormalized) discrete distribution p.
function sampleMultinomial(p::Vector{Float64})
    r = rand() * sum(p)
    for k in 1:length(p)
        if r < p[k]
            return k
        else
            r -= p[k]
        end
    end
    return length(p)  # guard against floating-point round-off
end
# Fill `out` with the unnormalized conditional distribution over topics for
# one token: (n_dk + alpha_k) * (n_kw + beta) / (n_k + V * beta).
function wordDistribution(word::Int,
                          document::Int,
                          model::Model,
                          out::Vector{Float64})
    V = size(model.topics, 2)
    for ii in 1:length(out)
        @inbounds out[ii] =
            (model.documentSums[ii, document] + model.alphaPrior[ii]) *
            (model.topics[ii, word] + model.betaPrior) /
            (model.topicSums[ii] + V * model.betaPrior)
    end
    return
end
# Sample a new topic for one token from its conditional distribution.
function sampleWord(word::Int,
                    document::Int,
                    model::Model,
                    p::Vector{Float64})
    wordDistribution(word, document, model, p)
    return sampleMultinomial(p)
end
function updateSufficientStatistics(word::Int64,
                                    topic::Int64,
                                    document::Int64,
                                    scale::Float64,
                                    model::Model)
    # A frozen model updates only its per-document counts; the shared
    # topic-word statistics stay fixed.
    fr = Float64(!model.frozen)
    @inbounds model.documentSums[topic, document] += scale
    @inbounds model.topicSums[topic] += scale * fr
    @inbounds model.topics[topic, word] += scale * fr
    return
end
# One Gibbs sweep over a document: remove each token's current assignment,
# resample it, and add the new assignment back.
function sampleDocument(document::Int, model::Model)
    words = model.corpus.documents[document]
    Nw = length(words)
    weights = model.corpus.weights[document]
    K = length(model.alphaPrior)
    p = Vector{Float64}(undef, K)
    assignments = model.assignments[document]
    for ii in 1:Nw
        @inbounds word = words[ii]
        @inbounds oldTopic = assignments[ii]
        updateSufficientStatistics(word, oldTopic, document, -weights[ii], model)
        newTopic = sampleWord(word, document, model, p)
        @inbounds assignments[ii] = newTopic
        updateSufficientStatistics(word, newTopic, document, weights[ii], model)
    end
    return
end
function sampleCorpus(model::Model)
    for ii in 1:length(model.corpus)
        sampleDocument(ii, model)
    end
    return
end
# Note: input files are zero-indexed, but Julia arrays are 1-indexed.
# A term "word:count" expands to `count` copies of word id `word + 1`.
function termToWordSequence(term::AbstractString)
    parts = split(term, ":")
    return fill(parse(Int64, parts[1]) + 1, parse(Int64, parts[2]))
end
# The functions below are designed for public consumption.

function trainModel(model::Model, numIterations::Int64)
    for ii in 1:numIterations
        println("Iteration ", ii, "...")
        sampleCorpus(model)
    end
    return
end
# Return the numWords highest-weight lexicon words for each topic.
function topTopicWords(model::Model,
                       lexicon::Vector{String},
                       numWords::Int64)
    return [lexicon[sortperm(model.topics[row, :], rev=true)[1:numWords]]
            for row in 1:size(model.topics, 1)]
end
# Each line is one document: a leading token (ignored) followed by
# whitespace-separated "word:count" terms.
function readDocuments(stream)
    lines = readlines(stream)
    return convert(
        RaggedMatrix{Int64},
        [reduce(vcat, [termToWordSequence(term) for term in split(line, " ")[2:end]])
         for line in lines])
end
# Read one lexicon word per line.
function readLexicon(stream)
    lines = readlines(stream)
    return [String(chomp(line)) for line in lines]
end
export Corpus,
       Model,
       readDocuments,
       readLexicon,
       topTopicWords,
       trainModel

end # module TopicModels
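
A minimal end-to-end sketch of the public API above. The file names, the number of topics K, and the prior values are illustrative placeholders, not part of the module; documents.dat is assumed to follow the format readDocuments expects (one document per line, a leading token followed by word:count terms with zero-indexed word ids) and lexicon.txt to hold one word per line.

using .TopicModels

# Hypothetical input files; paths and hyperparameters are placeholders.
lexicon = open(readLexicon, "lexicon.txt")
documents = open(readDocuments, "documents.dat")

corpus = Corpus(documents)     # unit weight for every token
K = 10                         # number of topics (illustrative)
alpha = fill(0.1, K)           # symmetric Dirichlet prior over topics
beta = 0.01                    # symmetric prior over words
V = length(lexicon)            # vocabulary size

model = Model(alpha, beta, V, corpus)
trainModel(model, 100)                    # 100 Gibbs sweeps over the corpus
println(topTopicWords(model, lexicon, 10))  # top 10 words per topic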
In machine learning and natural language processing, a topic model is a type of statistical model for discovering the abstract "topics" that occur in a collection of documents. Intuitively, given that a document is about a particular topic, one would expect particular words to appear in the document more or less frequently: "dog" and "bone" will appear more often in documents about dogs, "cat" and "meow" will appear in documents about cats, and "the" and "is" will appear equally in both. A document typically concerns multiple topics in different proportions; thus, in a document that is 10% about cats and 90% about dogs, there would probably be about 9 times more dog words than cat words. A topic model captures this intuition in a mathematical framework, which allows examining a set of documents and discovering, based on the statistics of the words in each, what the topics might be and what each document's balance of topics is.
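
As a toy illustration of that generative picture (a hedged sketch, separate from the module above; the two "topics" and their word lists are invented for the example):

# Toy sketch: draw tokens for a document that is 10% cat topic, 90% dog topic.
topics = Dict("cat" => ["cat", "meow", "purr"],
              "dog" => ["dog", "bone", "fetch"])
proportions = [("cat", 0.1), ("dog", 0.9)]

function generateWord()
    r = rand()
    for (name, p) in proportions
        r -= p
        r < 0 && return rand(topics[name])  # pick a word from the chosen topic
    end
    return rand(topics[last(proportions)[1]])  # round-off guard
end

doc = [generateWord() for _ in 1:1000]
# Roughly 900 of the 1000 tokens should be dog words, 100 cat words.
count(w -> w in topics["dog"], doc)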
In practice researchers attempt to fit appropriate model parameters to the data corpus using one of several heuristics for maximum likelihood fit. A recent survey by Blei describes this suite of algorithms.[4] Several groups of researchers, starting with Papadimitriou et al.,[1] have attempted to design algorithms with provable guarantees: assuming that the data was actually generated by the model in question, they try to design algorithms that provably recover the model that was used to create the data. Techniques used here include singular value decomposition (SVD), the method of moments, and, most recently, an algorithm based upon non-negative matrix factorization (NMF); this last algorithm also generalizes to topic models that allow correlations among topics.
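
The module listed above implements one such heuristic, collapsed Gibbs sampling for latent Dirichlet allocation. For a token with word id $w$ in document $d$, the unnormalized conditional it evaluates in wordDistribution is

$$p(z = k \mid w, d) \;\propto\; (n_{d,k} + \alpha_k)\,\frac{n_{k,w} + \beta}{n_k + V\beta},$$

where $n_{d,k}$, $n_{k,w}$, and $n_k$ are the weighted counts stored in documentSums, topics, and topicSums, $\alpha_k$ and $\beta$ are the Dirichlet priors, and $V$ is the vocabulary size.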