// TF-IDF reference: http://spark.apache.org/docs/latest/ml-features.html#tf-idf
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.SparkSession

import scala.collection.mutable
import scala.io.Source

/**
 * TF-IDF demo over a tiny in-memory corpus.
 *
 * Uses CountVectorizer (rather than HashingTF) to build the bag-of-words term
 * frequencies: CountVectorizer keeps a vocabulary, so the indices of the
 * resulting sparse vectors can be mapped back to the original words.
 * HashingTF computes tf-idf more efficiently via hashing, but the hash
 * buckets cannot be traced back to concrete words.
 *
 * Created by xubc on 2017/6/3.
 */
object TestX {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local[5]")
      .appName(this.getClass.getName.stripSuffix("$"))
      .getOrCreate()

    // Toy corpus: (label, sentence)
    val sentenceData = spark.createDataFrame(Seq(
      (0.0, "Hi I heard about are Spark"),
      (1.0, "I wish Java could use case spark classes"),
      (2.0, "Logistic regression regression models are neat I")
    )).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)

    // HashingTF bag-of-words model (alternative; no vocabulary to map back to):
    // val hashingTF = new HashingTF()
    //   .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(100)
    // val featurizedData = hashingTF.transform(wordsData)

    // CountVectorizer bag-of-words model: term counts + a recoverable vocabulary.
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words").setOutputCol("rawFeatures")
      .fit(wordsData)
    val featurizedData = cvModel.transform(wordsData)

    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.printSchema()

    val vocabulary = cvModel.vocabulary
    println(vocabulary.mkString(","))
    rescaledData.show(false)

    // collect() first so the printlns execute on the driver; a bare
    // DataFrame.foreach would run them on the executors.
    rescaledData.collect().foreach { row =>
      val label = row.getAs[Double]("label")
      val sentence = row.getAs[String]("sentence")
      // Seq[String] instead of mutable.WrappedArray: the concrete array wrapper
      // is an implementation detail (and WrappedArray is deprecated in 2.13).
      val words = row.getAs[Seq[String]]("words").mkString(",")
      val tf = row.getAs[SparseVector]("rawFeatures")
      // Map the sparse-vector term indices back through the vocabulary.
      val originWords = tf.indices.map(vocabulary(_)).mkString(",")
      // Renamed from `idf`, which shadowed the IDF estimator defined above.
      val tfidf = row.getAs[SparseVector]("features")
      println(
        s"""$label $sentence
           |$words
           |$tf $originWords
           |$tfidf""".stripMargin)
    }

    // Release the local Spark context.
    spark.stop()
  }
}

// Sample output (CountVectorizer lets us trace high tf-idf words back through
// the vocabulary; HashingTF cannot):
//
// 1.0 I wish Java could use case spark classes
// i,wish,java,could,use,case,spark,classes
// (16,[0,2,4,5,7,8,13,14],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]) i,spark,could,java,wish,case,classes,use
// (16,[0,2,4,5,7,8,13,14],[0.0,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])
// 2.0 Logistic regression regression models are neat I
// logistic,regression,regression,models,are,neat,i
// (16,[0,1,3,6,9,15],[1.0,2.0,1.0,1.0,1.0,1.0]) i,regression,are,neat,models,logistic
// (16,[0,1,3,6,9,15],[0.0,1.3862943611198906,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453])
// 0.0 Hi I heard about are Spark
// hi,i,heard,about,are,spark
// (16,[0,2,3,10,11,12],[1.0,1.0,1.0,1.0,1.0,1.0]) i,spark,are,about,hi,heard
// (16,[0,2,3,10,11,12],[0.0,0.28768207245178085,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453])