Codebase list feed2imap / 37fde67 lib / feed2imap / cache.rb
37fde67

Tree @37fde67 (Download .tar.gz)

cache.rb @37fde67raw · history · blame

=begin
Feed2Imap - RSS/Atom Aggregator uploading to an IMAP Server
Copyright (c) 2005 Lucas Nussbaum <lucas@lucas-nussbaum.net>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
=end

# debug mode
$updateddebug = false

# This class manages a cache of items
# (items which have already been seen)

require 'digest/md5'

class ItemCache
  def initialize(debug = false)
    @channels = {}
    @@cacheidx = 0
    $updateddebug = debug
    self
  end

  # Returns the really new items amongst items
  def get_new_items(id, items, always_new = false, ignore_hash = false)
    if $updateddebug
      puts "======================================================="
      puts "GET_NEW_ITEMS FOR #{id}... (#{Time::now})"
    end
    @channels[id] ||= CachedChannel::new
    @channels[id].parsefailures = 0
    return @channels[id].get_new_items(items, always_new, ignore_hash)
  end

  # Commit changes to the cache
  def commit_cache(id)
    @channels[id] ||= CachedChannel::new
    @channels[id].commit
  end

  # Get the last time the cache was updated
  def get_last_check(id)
    @channels[id] ||= CachedChannel::new
    @channels[id].lastcheck
  end

  # Get the last time the cache was updated
  def set_last_check(id, time)
    @channels[id] ||= CachedChannel::new
    @channels[id].lastcheck = time
    @channels[id].failures = 0
    self
  end

  # Fetching failure.
  # returns number of failures
  def fetch_failed(id)
    @channels[id].fetch_failed
  end

  # Parsing failure.
  # returns number of failures
  def parse_failed(id)
    @channels[id].parse_failed
  end

  # Load the cache from an IO stream
  def load(io)
    begin
      @@cacheidx, @channels = Marshal.load(io)
    rescue
      @channels = Marshal.load(io)
      @@cacheidx = 0
    end
  end

  # Save the cache to an IO stream
  def save(io)
    Marshal.dump([@@cacheidx, @channels], io)
  end
  
  # Return the number of channels in the cache
  def nbchannels
    @channels.length
  end

  # Return the number of items in the cache
  def nbitems
    nb = 0
    @channels.each_value { |c|
      nb += c.nbitems
    }
    nb
  end

  def ItemCache.getindex
    i = @@cacheidx
    @@cacheidx += 1
    i
  end
end

class CachedChannel
  # Size of the cache for each feed
  # 100 items should be enough for everybody, even quite busy feeds
  CACHESIZE = 100

  attr_accessor :lastcheck, :items, :failures, :parsefailures

  def initialize
    @lastcheck = Time::at(0)
    @items = []
    @itemstemp = [] # see below
    @nbnewitems = 0
    @failures = 0
    @parsefailures = 0
  end

  # Let's explain @items and @itemstemp.
  # @items contains the CachedItems serialized to the disk cache.
  # The - quite complicated - get_new_items method fills in @itemstemp
  # but leaves @items unchanged.
  # Later, the commit() method replaces @items with @itemstemp and
  # empties @itemstemp. This way, if something wrong happens during the
  # upload to the IMAP server, items aren't lost.
  # @nbnewitems is set by get_new_items, and is used to limit the number
  # of (old) items serialized.

  # Returns the really new items amongst items
  def get_new_items(items, always_new = false, ignore_hash = false)
    # save number of new items
    @nbnewitems = items.length
    # set items' cached version if not set yet
    newitems = []
    updateditems = []
    @itemstemp = @items
    items.each { |i| i.cacheditem ||= CachedItem::new(i) }
    if $updateddebug
      puts "-------Items downloaded before dups removal (#{items.length}) :----------"
      items.each { |i| puts "#{i.cacheditem.to_s}" }
    end
    # remove dups
    dups = true
    while dups
      dups = false
      for i in 0...items.length do
        for j in i+1...items.length do
          if items[i].cacheditem == items[j].cacheditem
            if $updateddebug
              puts "## Removed duplicate #{items[j].cacheditem.to_s}"
            end
            items.delete_at(j)
            dups = true
            break
          end
        end
        break if dups
      end
    end
    # debug : dump interesting info to stdout.
    if $updateddebug
      puts "-------Items downloaded after dups removal (#{items.length}) :----------"
      items.each { |i| puts "#{i.cacheditem.to_s}" }
      puts "-------Items already there (#{@items.length}) :----------"
      @items.each { |i| puts "#{i.to_s}" }
      puts "Items always considered as new: #{always_new.to_s}"
      puts "Items compared ignoring the hash: #{ignore_hash.to_s}"
    end
    items.each do |i|
      found = false
      # Try to find a perfect match
      @items.each do |j|
        # note that simple_compare only CachedItem, not RSSItem, so we have to use
        # j.simple_compare(i) and not i.simple_compare(j)
        if (i.cacheditem == j and not ignore_hash) or
            (j.simple_compare(i) and ignore_hash)
          i.cacheditem.index = j.index
          found = true
          # let's put j in front of itemstemp
          @itemstemp.delete(j)
          @itemstemp.unshift(j)
          break
        end
      end
      next if found
      if not always_new
        # Try to find an updated item
        @items.each do |j|
          # Do we need a better heuristic ?
          if j.is_ancestor_of(i)
            i.cacheditem.index = j.index
            i.cacheditem.updated = true
            updateditems.push(i)
            found = true
            # let's put j in front of itemstemp
            @itemstemp.delete(j)
            @itemstemp.unshift(i.cacheditem)
            break
          end
        end
      end
      next if found
      # add as new
      i.cacheditem.create_index
      newitems.push(i)
      # add i.cacheditem to @itemstemp
      @itemstemp.unshift(i.cacheditem)
    end
    if $updateddebug
      puts "-------New items :----------"
      newitems.each { |i| puts "#{i.cacheditem.to_s}" }
      puts "-------Updated items :----------"
      updateditems.each { |i| puts "#{i.cacheditem.to_s}" }
    end
    return [newitems, updateditems]
  end
  
  def commit
    # too old items must be dropped
    n = @nbnewitems > CACHESIZE ? @nbnewitems : CACHESIZE
    @items = @itemstemp[0..n]
    if $updateddebug
      puts "Committing: new items: #{@nbnewitems} / items kept: #{@items.length}"
    end
    @itemstemp = []
    self
  end

  # returns the number of items
  def nbitems
    @items.length
  end

  def parse_failed
    @parsefailures = 0 if @parsefailures.nil?
    @parsefailures += 1
    return @parsefailures
  end

  def fetch_failed
    @failures = 0 if @failures.nil?
    @failures += 1
    return @failures
  end
end

# This class is the only thing kept in the cache
class CachedItem
  attr_reader :title, :link, :creator, :date, :hash
  attr_accessor :index
  attr_accessor :updated

  def initialize(item)
    @title = item.title
    @link = item.link
    @date = item.date
    @creator = item.creator
    if item.content.nil?
      @hash = nil
    else
      @hash = Digest::MD5.hexdigest(item.content.to_s)
    end
  end

  def ==(other)
    if $updateddebug
      puts "Comparing #{self.to_s} and #{other.to_s}:"
      puts "Title: #{@title == other.title}"
      puts "Link: #{@link == other.link}"
      puts "Creator: #{@creator == other.creator}"
      puts "Date: #{@date == other.date}"
      puts "Hash: #{@hash == other.hash}"
    end
    @title == other.title and @link == other.link and
        (@creator.nil? or other.creator.nil? or @creator == other.creator) and
	(@date.nil? or other.date.nil? or @date == other.date) and @hash == other.hash
  end

  def simple_compare(other)
    @title == other.title and @link == other.link and
        (@creator.nil? or other.creator.nil? or @creator == other.creator) 
  end

  def create_index
    @index = ItemCache.getindex
  end

  def is_ancestor_of(other)
    (@link and other.link and @link == other.link) and
      ((@creator and other.creator and @creator == other.creator) or (@creator.nil?))
  end

  def to_s
    "\"#{@title}\" #{@creator}/#{@date} #{@link} #{@hash}"
  end
end