Methods

Linguist::Samples

Model for accessing classifier training data.

Constants

DATA
PATH

Path for serialized samples db

ROOT

Path to samples root directory

Public Class Methods

data() click to toggle source

Public: Build the classifier training database from all samples.

Returns the trained db Hash, including an 'md5' digest of its contents.

# File lib/linguist/samples.rb, line 73
# Public: Build the classifier training database from all samples.
#
# Walks every sample yielded by `each`, records which extnames,
# interpreters and filenames were seen per language, and passes each
# sample's file contents to Classifier.train! together with the db.
#
# Returns the db Hash. It always contains the 'extnames', 'interpreters',
# 'filenames' and 'md5' keys, plus whatever Classifier.train! stores in it.
def self.data
  db = { 'extnames' => {}, 'interpreters' => {}, 'filenames' => {} }

  each do |sample|
    language = sample[:language]

    # Extnames and interpreters are kept as sorted lists of distinct values.
    [['extnames', :extname], ['interpreters', :interpreter]].each do |section, field|
      value = sample[field]
      next unless value

      known = (db[section][language] ||= [])
      next if known.include?(value)

      known.push(value)
      known.sort!
    end

    # Filenames are appended without a duplicate check, then kept sorted.
    if (name = sample[:filename])
      names = (db['filenames'][language] ||= [])
      names.push(name)
      names.sort!
    end

    Classifier.train!(db, language, File.read(sample[:path]))
  end

  # The digest is computed before the 'md5' key itself is added.
  db['md5'] = Linguist::MD5.hexdigest(db)
  db
end
each(&block) click to toggle source

Public: Iterate over each sample.

&block - Yields each sample to the block as a Hash with :path and :language, plus either :filename or :interpreter and :extname

Returns nothing.

# File lib/linguist/samples.rb, line 30
# Public: Iterate over each sample found under ROOT.
#
# Every entry of ROOT except '.', '..', 'Text' and 'Binary' is treated as a
# language category directory. Inside a category, entries of the special
# 'filenames' subdirectory are yielded with a :filename key; every other
# entry must have a file extension and is yielded with :interpreter and
# :extname keys.
#
# &block - Called once per sample with a Hash:
#          :path        - Path of the sample file (joined onto ROOT).
#          :language    - Name of the category directory.
#          :filename    - Basename (only for samples under 'filenames').
#          :interpreter - Result of Linguist.interpreter_from_shebang on the
#                         file contents, or nil when the path is not a
#                         regular file (only for extension samples).
#          :extname     - File extension (only for extension samples).
#
# Raises RuntimeError if a sample outside 'filenames' has no extension.
#
# Returns nothing.
def self.each(&block)
  Dir.entries(ROOT).sort!.each do |category|
    next if category == '.' || category == '..'

    # Skip text and binary for now
    # Possibly reconsider this later
    next if category == 'Text' || category == 'Binary'

    dirname = File.join(ROOT, category)
    Dir.entries(dirname).each do |filename|
      next if filename == '.' || filename == '..'

      if filename == 'filenames'
        Dir.entries(File.join(dirname, filename)).each do |subfilename|
          next if subfilename == '.' || subfilename == '..'

          yield({
            :path    => File.join(dirname, filename, subfilename),
            :language => category,
            :filename => subfilename
          })
        end
      else
        path = File.join(dirname, filename)

        if File.extname(filename) == ""
          raise "#{path} is missing an extension, maybe it belongs in filenames/ subdir"
        end

        # Read the shebang from the sample's full path. The bare basename
        # would be resolved against the current working directory, not the
        # category directory. File.file? keeps directories away from File.read.
        interpreter = File.file?(path) ? Linguist.interpreter_from_shebang(File.read(path)) : nil

        yield({
          :path     => path,
          :language => category,
          :interpreter => interpreter,
          :extname  => File.extname(filename)
        })
      end
    end
  end

  nil
end

[Validate]

Generated with the Darkfish Rdoc Generator 2.