MongoDB टेक्स्ट इंडेक्स और टेक्स्ट स्कोर में दस्तावेज़ की लंबाई को कैसे संभालता है?

स्कोरिंग स्टेम किए गए (stemmed) मैचों की संख्या पर आधारित होती है, लेकिन एक अंतर्निहित गुणांक भी होता है जो कुल फ़ील्ड लंबाई (स्टॉपवर्ड हटाने के बाद) के सापेक्ष मैचों के स्कोर को समायोजित करता है। यदि आपके लंबे टेक्स्ट में किसी क्वेरी के लिए अधिक प्रासंगिक शब्द शामिल हैं, तो इससे स्कोर बढ़ेगा। लंबा टेक्स्ट जो क्वेरी से मेल नहीं खाता, स्कोर को कम कर देगा।

GitHub पर MongoDB 3.2 स्रोत कोड से स्निपेट (src/mongo/db/fts/fts_spec.cpp):

   for (ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i) {
        const string& term = i->first;
        const ScoreHelperStruct& data = i->second;

        // in order to adjust weights as a function of term count as it
        // relates to total field length. ie. is this the only word or
        // a frequently occuring term? or does it only show up once in
        // a long block of text?

        double coeff = (0.5 * data.count / numTokens) + 0.5;

        // if term is identical to the raw form of the
        // field (untokenized) give it a small boost.
        double adjustment = 1;
        if (raw.size() == term.length() && raw.equalCaseInsensitive(term))
            adjustment += 0.1;

        double& score = (*docScores)[term];
        score += (weight * data.freq * coeff * adjustment);
        verify(score <= MAX_WEIGHT);
    }
}

एक बहुत ही सरल उदाहरण पर लंबाई गुणांक के प्रभाव को देखने के लिए कुछ परीक्षण डेटा सेट करना:

db.articles.insert([
    { headline: "Rock" },
    { headline: "Rocks" },
    { headline: "Rock paper" },
    { headline: "Rock paper scissors" },
])

db.articles.createIndex({ "headline": "text"})

db.articles.find(
    { $text: { $search: "rock" }},
    { _id:0, headline:1, score: { $meta: "textScore" }}
).sort({ score: { $meta: "textScore" }})

व्याख्या किए गए परिणाम:

// Exact match of raw term to indexed field
// Coefficient is 1, plus 0.1 bonus for identical match of raw term
{
  "headline": "Rock",
  "score": 1.1
}

// Match of stemmed term to indexed field ("rocks" stems to "rock")
// Coefficient is 1
{
  "headline": "Rocks",
  "score": 1
}

// Two terms, one matching
// Coefficient is 0.75: (0.5 * 1 match / 2 terms) + 0.5
{
  "headline": "Rock paper",
  "score": 0.75
}

// Three terms, one matching
// Coefficient is ~0.67: (0.5 * 1 match / 3 terms) + 0.5
{
  "headline": "Rock paper scissors",
  "score": 0.6666666666666666
}