- // This conditional may be terribly wrong
- // It was there to address the situation where vec[0] == vec[i]
- // which leads to idfvec[i] == 0... not sure about this
- // Traditional TF-IDF may assume that a word that occurs in every
- // record is irrelevant, but this is actually something we will
- // see a lot
- if ((idfvec[i] = log((float) rel->doc_frequency_vec[0] /
- rel->doc_frequency_vec[i])) < 0.0000001)
- idfvec[i] = 1;
+ /* add one to the numerator of idf(t,D) to ensure a value > 0 */
+ idfvec[i] = log((float) (1 + rel->doc_frequency_vec[0]) /
+ rel->doc_frequency_vec[i]);