In Files

Parent

Class/Module Index [+]

Quicksearch

Ferret::Search::Searcher

Summary

The Searcher class basically performs the task that Ferret was built for. It searches the index. To search the index the Searcher class wraps an IndexReader so many of the tasks that you can perform on an IndexReader are also available on a searcher including, most importantly, accessing stored documents.

The main methods that you need to know about when using a Searcher are the search methods. There is the Searcher#search_each method which iterates through the results by document id and score and there is the Searcher#search method which returns a TopDocs object. Another important difference to note is that the Searcher#search_each method normalizes the score to a value in the range 0.0..1.0 if the max_score is greater than 1.0. Searcher#search does not. Apart from that they take the same parameters and work the same way.

Example

searcher = Searcher.new("/path/to/index")

searcher.search_each(TermQuery.new(:content, "ferret"),
                     :filter => RangeFilter.new(:date, :< => "2006"),
                     :sort => "date DESC, title") do |doc_id, score|
    puts "#{searcher[doc_id][:title]} scored #{score}"
end

Public Class Methods

new(obj) → Searcher click to toggle source

Create a new Searcher object. obj can be either a string path to an index directory on the file-system, an actual Ferret::Store::Directory object or a Ferret::Index::IndexReader. You should use the IndexReader for searching multiple indexes. Just open the IndexReader on multiple directories.

/*
 * Initialise a Searcher from +obj+, which may be a String path, a
 * Directory object or an IndexReader object. Any other argument raises
 * ArgumentError (or TypeError from Check_Type when it is not a T_DATA
 * object at all).
 *
 * Returns self.
 */
static VALUE
frb_sea_init(VALUE self, VALUE obj)
{
    IndexReader *ir = NULL;
    Store *store = NULL;
    IndexSearcher *isea;
    Searcher *sea;

    if (TYPE(obj) != T_STRING) {
        /* Not a path, so it has to be one of the wrapped C structs. */
        Check_Type(obj, T_DATA);
        if (rb_obj_is_kind_of(obj, cDirectory) == Qtrue) {
            Data_Get_Struct(obj, Store, store);
            ir = ir_open(store);
            FRT_GET_IR(obj, ir);
        }
        else if (rb_obj_is_kind_of(obj, cIndexReader) == Qtrue) {
            Data_Get_Struct(obj, IndexReader, ir);
        }
        else {
            rb_raise(rb_eArgError, "Unknown type for argument to IndexSearcher.new");
        }
    }
    else {
        /* A path on the file-system: open the store there and read it. */
        frb_create_dir(obj);
        store = open_fs_store(StringValueCStr(obj));
        ir = ir_open(store);
        /* Drop the local reference taken by open_fs_store().
         * NOTE(review): presumably ir_open() holds its own reference to
         * the store -- confirm. */
        DEREF(store);
        FRT_GET_IR(obj, ir);
    }

    sea = isea_new(ir);
    /* The searcher must not close the reader when it is closed itself. */
    isea = (IndexSearcher *)sea;
    isea->close_ir = false;
    Frt_Wrap_Struct(self, &frb_sea_mark, &frb_sea_free, sea);
    object_add(sea, self);

    return self;
}

Public Instance Methods

get_document(doc_id) → LazyDoc click to toggle source
searcher[doc_id] → LazyDoc

Retrieve a document from the index. See LazyDoc for more details on the document returned. Documents are referenced internally by document ids which are returned by the Searcher's search methods.

/*
 * Return the lazily loaded document stored under +rdoc_id+.
 *
 * rdoc_id must be a Fixnum; any other object raises TypeError.
 */
static VALUE
frb_sea_doc(VALUE self, VALUE rdoc_id)
{
    GET_SEA();
    /* FIX2INT performs no type check of its own, so a nil or String
     * argument would otherwise be silently reinterpreted as a doc id. */
    Check_Type(rdoc_id, T_FIXNUM);
    return frb_get_lazy_doc(sea->get_lazy_doc(sea, FIX2INT(rdoc_id)));
}
close → nil click to toggle source

Close the searcher. The garbage collector will do this for you or you can call this method explicitly.

/*
 * Close the searcher explicitly.
 *
 * Always returns nil.
 */
static VALUE
frb_sea_close(VALUE self)
{
    GET_SEA();
    /* Detach the C struct from self before it is closed.
     * NOTE(review): presumably this stops the GC free hook from touching
     * the searcher a second time -- confirm against Frt_Unwrap_Struct. */
    Frt_Unwrap_Struct(self);
    /* Counterpart of the object_add(sea, self) call in frb_sea_init. */
    object_del(sea);
    sea->close(sea);
    return Qnil;
}
doc_freq(field, term) → integer click to toggle source

Return the number of documents in which the term term appears in the field field.

/*
 * Look up the document frequency of the term +rterm+ in the field
 * +rfield+ and return it as a Fixnum.
 */
static VALUE
frb_sea_doc_freq(VALUE self, VALUE rfield, VALUE rterm)
{
    VALUE rfreq;
    GET_SEA();

    rfreq = INT2FIX(sea->doc_freq(sea, frb_field(rfield), StringValuePtr(rterm)));
    return rfreq;
}
explain(query, doc_id) → Explanation click to toggle source

Create an explanation object to explain the score returned for a particular document at doc_id in the index for the query query.

Usually used like this;

puts searcher.explain(query, doc_id).to_s
/*
 * Build an Explanation describing how the document +rdoc_id+ scored
 * against +rquery+.
 *
 * rdoc_id must be a Fixnum; any other object raises TypeError.
 * The returned Ruby object owns the explanation and releases it with
 * expl_destroy.
 */
static VALUE
frb_sea_explain(VALUE self, VALUE rquery, VALUE rdoc_id)
{
    GET_SEA();
    Query *query;
    Explanation *expl;
    Data_Get_Struct(rquery, Query, query);
    /* FIX2INT performs no type check of its own, so a nil or String
     * argument would otherwise be silently reinterpreted as a doc id. */
    Check_Type(rdoc_id, T_FIXNUM);
    expl = sea->explain(sea, query, FIX2INT(rdoc_id));
    return Data_Wrap_Struct(cExplanation, NULL, &expl_destroy, expl);
}
get_document(doc_id) → LazyDoc click to toggle source
searcher[doc_id] → LazyDoc

Retrieve a document from the index. See LazyDoc for more details on the document returned. Documents are referenced internally by document ids which are returned by the Searcher's search methods.

/*
 * Return the lazily loaded document stored under +rdoc_id+.
 *
 * rdoc_id must be a Fixnum; any other object raises TypeError.
 */
static VALUE
frb_sea_doc(VALUE self, VALUE rdoc_id)
{
    GET_SEA();
    /* FIX2INT performs no type check of its own, so a nil or String
     * argument would otherwise be silently reinterpreted as a doc id. */
    Check_Type(rdoc_id, T_FIXNUM);
    return frb_get_lazy_doc(sea->get_lazy_doc(sea, FIX2INT(rdoc_id)));
}
highlight(query, doc_id, field, options = {}) → Array click to toggle source

Returns an array of strings with the matches highlighted.

Options

:excerpt_length

Default: 150. Length of excerpt to show. Highlighted terms will be in the centre of the excerpt. Set to :all to highlight the entire field.

:num_excerpts

Default: 2. Number of excerpts to return.

:pre_tag

Default: “<b>”. Tag to place to the left of the match. You’ll probably want to change this to a “<span>” tag with a class. Try “\033[7m” for use in a terminal.

:post_tag

Default: “</b>”. This tag should close the :pre_tag. Try the tag “\033[m” in a terminal.

:ellipsis

Default: “...”. This is the string that is added at the beginning and end of excerpts (unless the excerpt hits the start or end of the field). You’ll probably want to change this to a Unicode ellipsis character.

/*
 * Searcher#highlight(query, doc_id, field, options = {})
 *
 * Returns an Array of excerpt strings for +field+ of document +doc_id+
 * with the matches of +query+ wrapped in the pre/post tags, or nil when
 * searcher_highlight() produces no excerpts.
 *
 * doc_id, :num_excerpts and :excerpt_length (unless it is :all) must be
 * Fixnums; any other object raises TypeError. A fourth argument that is
 * not a Hash raises ArgumentError.
 */
static VALUE
frb_sea_highlight(int argc, VALUE *argv, VALUE self)
{
    GET_SEA();
    VALUE rquery, rdoc_id, rfield, roptions, v;
    Query *query;
    int excerpt_length = 150;
    int num_excerpts = 2;
    char *pre_tag = "<b>";
    char *post_tag = "</b>";
    char *ellipsis = "...";
    char **excerpts;

    rb_scan_args(argc, argv, "31", &rquery, &rdoc_id, &rfield, &roptions);
    Data_Get_Struct(rquery, Query, query);
    /* FIX2INT performs no type check of its own, so every integer taken
     * from the caller is validated before it is converted. */
    Check_Type(rdoc_id, T_FIXNUM);
    if (argc > 3) {
        if (TYPE(roptions) != T_HASH) {
           rb_raise(rb_eArgError, "The fourth argument to Searcher#highlight must be a hash");
        }
        if (Qnil != (v = rb_hash_aref(roptions, sym_num_excerpts))) {
            Check_Type(v, T_FIXNUM);
            num_excerpts = FIX2INT(v);
        }
        if (Qnil != (v = rb_hash_aref(roptions, sym_excerpt_length))) {
            if (v == sym_all) {
                /* :all means one excerpt covering the whole field; this
                 * deliberately overrides any :num_excerpts given above. */
                num_excerpts = 1;
                excerpt_length = INT_MAX/2;
            }
            else {
                Check_Type(v, T_FIXNUM);
                excerpt_length = FIX2INT(v);
            }
        }
        if (Qnil != (v = rb_hash_aref(roptions, sym_pre_tag))) {
            pre_tag = rs2s(rb_obj_as_string(v));
        }
        if (Qnil != (v = rb_hash_aref(roptions, sym_post_tag))) {
            post_tag = rs2s(rb_obj_as_string(v));
        }
        if (Qnil != (v = rb_hash_aref(roptions, sym_ellipsis))) {
            ellipsis = rs2s(rb_obj_as_string(v));
        }
    }

    if ((excerpts = searcher_highlight(sea,
                                       query,
                                       FIX2INT(rdoc_id),
                                       frb_field(rfield),
                                       excerpt_length,
                                       num_excerpts,
                                       pre_tag,
                                       post_tag,
                                       ellipsis)) != NULL) {
        const int size = ary_size(excerpts);
        int i;
        VALUE rexcerpts = rb_ary_new2(size);

        /* Copy each C excerpt into a Ruby String, then release the C
         * array together with its elements. */
        for (i = 0; i < size; i++) {
          rb_ary_store(rexcerpts, i, rb_str_new2(excerpts[i]));
        }
        ary_destroy(excerpts, &free);
        return rexcerpts;
    }
    return Qnil;
}
max_doc → number click to toggle source

Returns 1 + the maximum document id in the index. It is the document_id that will be used by the next document added to the index. If there are no deletions, this number also refers to the number of documents in the index.

/*
 * Return the searcher's max_doc value as a Fixnum.
 */
static VALUE
frb_sea_max_doc(VALUE self)
{
    VALUE rmax_doc;
    GET_SEA();

    rmax_doc = INT2FIX(sea->max_doc(sea));
    return rmax_doc;
}
reader → IndexReader click to toggle source

Return the IndexReader wrapped by this searcher.

/*
 * Return the Ruby object registered for the IndexReader this searcher
 * wraps.
 *
 * NOTE(review): rterm is never read; the parameter is kept so the
 * signature still matches however the method is registered.
 */
static VALUE
frb_sea_get_reader(VALUE self, VALUE rterm)
{
    IndexSearcher *isea;
    GET_SEA();

    isea = (IndexSearcher *)sea;
    return object_get(isea->ir);
}
scan(query, options = {}) → Array (doc_nums) click to toggle source

Run a query through the Searcher on the index, ignoring scoring and starting at :start_doc and stopping when :limit matches have been found. It returns an array of the matching document numbers.

There is a big performance advantage when using this search method on a very large index when there are potentially thousands of matching documents and you only want, say, 50 of them. The other search methods need to look at every single match to decide which one has the highest score. This search method just needs to find :limit number of matches before it returns.

Options

:start_doc

Default: 0. The start document to start the search from. NOTE very carefully that this is not the same as the :offset parameter used in the other search methods which refers to the offset in the result-set. This is the document to start the scan from. So if you are scanning through the index in increments of 50 documents at a time you need to use the last matched doc in the previous search to start your next search. See the example below.

:limit

Default: 50. This is the number of results you want returned, also called the page size. Set :limit to :all to return all results.

TODO: add option to return loaded documents instead

Example

start_doc = 0
begin
  results = @searcher.scan(query, :start_doc => start_doc)
  yield results # or do something with them
  start_doc = results.last
  # start_doc will be nil now if results is empty, ie no more matches
end while start_doc
/*
 * Searcher#scan(query, options = {})
 *
 * Collects up to :limit matching document numbers, starting the scan at
 * document :start_doc and ignoring scores, and returns them as an Array.
 *
 * Raises TypeError when the options are not a Hash or :start_doc is not
 * a Fixnum, and ArgumentError when :start_doc is negative or :limit is
 * neither a positive Fixnum nor :all.
 */
static VALUE
frb_sea_scan(int argc, VALUE *argv, VALUE self)
{
    Query *q;
    int i, count, max_hits;
    VALUE rval, rquery, roptions;
    int *doc_array;
    VALUE rdoc_array;
    int start_doc = 0, limit = 50;
    GET_SEA();
    rb_scan_args(argc, argv, "11", &rquery, &roptions);
    Data_Get_Struct(rquery, Query, q);

    if (Qnil != roptions) {
        Check_Type(roptions, T_HASH);
        if (Qnil != (rval = rb_hash_aref(roptions, sym_start_doc))) {
            Check_Type(rval, T_FIXNUM);
            start_doc = FIX2INT(rval);
            if (start_doc < 0) {
                rb_raise(rb_eArgError, ":start_doc must be >= 0");
            }
        }
        if (Qnil != (rval = rb_hash_aref(roptions, sym_limit))) {
            if (TYPE(rval) == T_FIXNUM) {
                limit = FIX2INT(rval);
                if (limit <= 0) {
                    rb_raise(rb_eArgError, ":limit must be > 0");
                }
            }
            else if (rval == sym_all) {
                limit = INT_MAX;
            }
            else {
                rb_raise(rb_eArgError, "%s is not a sensible :limit value. "
                         "Please use a positive integer or :all",
                         rs2s(rb_obj_as_string(rval)));
            }
        }
    }

    /* Document ids lie below max_doc, so a scan that starts at start_doc
     * can never yield more than (max_doc - start_doc) ids. Clamping the
     * limit keeps ":limit => :all" (INT_MAX) from requesting a result
     * buffer of INT_MAX ints. */
    max_hits = sea->max_doc(sea) - start_doc;
    if (max_hits <= 0) {
        /* Scan starts at or past the end of the index: nothing to find. */
        return rb_ary_new2(0);
    }
    if (limit > max_hits) {
        limit = max_hits;
    }

#ifndef FRT_RUBY_VERSION_1_9
    rb_thread_critical = Qtrue;
#endif
    doc_array = ALLOC_N(int, limit);
    count = searcher_search_unscored(sea, q, doc_array, limit, start_doc);
    rdoc_array = rb_ary_new2(count);
    for (i = 0; i < count; i++) {
      rb_ary_store(rdoc_array, i, INT2FIX(doc_array[i]));
    }
    free(doc_array);
#ifndef FRT_RUBY_VERSION_1_9
    rb_thread_critical = 0;
#endif
    return rdoc_array;
}
search(query, options = {}) → TopDocs click to toggle source

Run a query through the Searcher on the index. A TopDocs object is returned with the relevant results. The query is a built in Query object. Here are the options;

Options

:offset

Default: 0. The offset of the start of the section of the result-set to return. This is used for paging through results. Let’s say you have a page size of 10. If you don’t find the result you want among the first 10 results then set :offset to 10 and look at the next 10 results, then 20 and so on.

:limit

Default: 10. This is the number of results you want returned, also called the page size. Set :limit to :all to return all results

:sort

A Sort object or sort string describing how the field should be sorted. A sort string is made up of field names which cannot contain spaces and the word “DESC” if you want the field reversed, all separated by commas. For example; “rating DESC, author, title”. Note that Ferret will try to determine a field’s type by looking at the first term in the index and seeing if it can be parsed as an integer or a float. Keep this in mind as you may need to specify a field’s type to sort it correctly. For more on this, see the documentation for SortField

:filter

a Filter object to filter the search results with

:filter_proc

a filter Proc is a Proc which takes the doc_id, the score and the Searcher object as its parameters and returns either a Boolean value specifying whether the result should be included in the result set, or a Float between 0 and 1.0 to be used as a factor to scale the score of the object. This can be used, for example, to weight the score of a matched document by its age.

/*
 * Searcher#search(query, options = {})
 *
 * Runs +query+ with the given options through frb_sea_search_internal()
 * and hands the resulting TopDocs to frb_get_td() for conversion into
 * the Ruby return value.
 */
static VALUE
frb_sea_search(int argc, VALUE *argv, VALUE self)
{
    VALUE rquery, roptions;
    Query *query;
    TopDocs *td;
    GET_SEA();

    rb_scan_args(argc, argv, "11", &rquery, &roptions);
    Data_Get_Struct(rquery, Query, query);

    td = frb_sea_search_internal(query, roptions, sea);
    return frb_get_td(td, self);
}
search_each(query, options = {}) {|doc_id, score| do_something} click to toggle source
→ total_hits

Run a query through the Searcher on the index and return the total number of hits. The query is a Query object. The Searcher#search_each method yields the internal document id (used to reference documents in the Searcher object like this; searcher[doc_id]) and the search score for that document. It is possible for the score to be greater than 1.0 for some queries and taking boosts into account. This method will also normalize scores to the range 0.0..1.0 when the max-score is greater than 1.0. Here are the options;

Options

:offset

Default: 0. The offset of the start of the section of the result-set to return. This is used for paging through results. Let’s say you have a page size of 10. If you don’t find the result you want among the first 10 results then set :offset to 10 and look at the next 10 results, then 20 and so on.

:limit

Default: 10. This is the number of results you want returned, also called the page size. Set :limit to :all to return all results

:sort

A Sort object or sort string describing how the field should be sorted. A sort string is made up of field names which cannot contain spaces and the word “DESC” if you want the field reversed, all separated by commas. For example; “rating DESC, author, title”. Note that Ferret will try to determine a field’s type by looking at the first term in the index and seeing if it can be parsed as an integer or a float. Keep this in mind as you may need to specify a field’s type to sort it correctly. For more on this, see the documentation for SortField

:filter

a Filter object to filter the search results with

:filter_proc

a filter Proc is a Proc which takes the doc_id, the score and the Searcher object as its parameters and returns a Boolean value specifying whether the result should be included in the result set.

static VALUE
frb_sea_search_each(int argc, VALUE *argv, VALUE self)
{
    int i;
    Query *q;
    float max_score;
    TopDocs *td;
    VALUE rquery, roptions, rtotal_hits;
    GET_SEA();

    rb_scan_args(argc, argv, "11", &rquery, &roptions);

#ifndef FRT_RUBY_VERSION_1_9
    rb_thread_critical = Qtrue;
#endif
    Data_Get_Struct(rquery, Query, q);
    td = frb_sea_search_internal(q, roptions, sea);

    max_score = (td->max_score > 1.0) ? td->max_score : 1.0;

    /* yield normalized scores */
    for (i = 0; i < td->size; i++) {
        rb_yield_values(2, INT2FIX(td->hits[i]->doc),
                        rb_float_new((double)(td->hits[i]->score/max_score)));
    }

    rtotal_hits = INT2FIX(td->total_hits);
    td_destroy(td);

#ifndef FRT_RUBY_VERSION_1_9
    rb_thread_critical = 0;
#endif
    return rtotal_hits;
}

[Validate]

Generated with the Darkfish Rdoc Generator 2.