diff --git a/Rakefile b/Rakefile index a6a9b42..95fd2f8 100644 --- a/Rakefile +++ b/Rakefile @@ -1,13 +1,13 @@ require 'rake/testtask' -require 'rake/rdoctask' +require 'rdoc/task' require 'rake/packagetask' require 'rake' require 'find' -task :default => [:package] +task :default => [:test] PKG_NAME = 'feed2imap' -PKG_VERSION = '1.0' +PKG_VERSION = '1.1' PKG_FILES = [ 'ChangeLog', 'README', 'COPYING', 'setup.rb', 'Rakefile'] Find.find('bin/', 'lib/', 'test/', 'data/') do |f| if FileTest.directory?(f) and f =~ /\.svn/ @@ -19,10 +19,10 @@ Rake::TestTask.new do |t| t.libs << "libs/feed2imap" t.libs << "test" - t.test_files = FileList['test/tc_*.rb'] + t.test_files = FileList['test/tc_*.rb'] - ['test/tc_httpfetcher.rb'] end -Rake::RDocTask.new do |rd| +RDoc::Task.new do |rd| rd.main = 'README' rd.rdoc_files.include('lib/*.rb', 'lib/feed2imap/*.rb') rd.options << '--all' @@ -41,7 +41,7 @@ # "Gem" part of the Rakefile begin - require 'rake/gempackagetask' + require 'rubygems/package_task' spec = Gem::Specification.new do |s| s.platform = Gem::Platform::RUBY @@ -50,11 +50,15 @@ s.version = PKG_VERSION s.requirements << 'feedparser' s.require_path = 'lib' + s.executables = PKG_FILES.grep(%r{\Abin\/.}).map { |bin| + bin.gsub(%r{\Abin/}, '') + } s.files = PKG_FILES s.description = "RSS/Atom feed aggregator" + s.authors = ['Lucas Nussbaum'] end - Rake::GemPackageTask.new(spec) do |pkg| + Gem::PackageTask.new(spec) do |pkg| pkg.need_zip = true pkg.need_tar = true end diff --git a/data/doc/feed2imap/examples/feed2imaprc b/data/doc/feed2imap/examples/feed2imaprc index 4fd8797..7e93386 100644 --- a/data/doc/feed2imap/examples/feed2imaprc +++ b/data/doc/feed2imap/examples/feed2imaprc @@ -7,9 +7,12 @@ # debug-updated: (for debugging purposes) if true, display a lot of information # about the "updated-items" algorithm. # include-images: download images and include them in the mail? (true/false) +# reupload-if-updated: when an item is updated, and was previously deleted, +# reupload it? (true/false, default true) # default-email: default email address in the format foo@example.com # disable-ssl-verification: disable SSL certification when connecting # to IMAPS accounts (true/false) +# timeout: time before getting timeout when fetching feeds (default 30) in seconds # # Per-feed options: # name: name of the feed (must be unique) @@ -20,6 +23,8 @@ # feed will be fetched # disable: if set to something, the feed will be ignored # include-images: download images and include them in the mail? (true/false) +# reupload-if-updated: when an item is updated, and was previously deleted, +# reupload it? (true/false, default true) # always-new: feed2imap tries to use a clever algorithm to determine whether # an item is new or has been updated. It doesn't work well with some web apps # like mediawiki. When this flag is enabled, all items which don't match @@ -63,3 +68,5 @@ # - name: test2 # target: [ *target, 'test2' ] # ... + +# vim: ft=yaml:sts=2:expandtab diff --git a/lib/feed2imap/cache.rb b/lib/feed2imap/cache.rb index 6dc0ddf..9fb9930 100644 --- a/lib/feed2imap/cache.rb +++ b/lib/feed2imap/cache.rb @@ -193,22 +193,16 @@ @itemstemp.unshift(j) break end - end - next if found - if not always_new - # Try to find an updated item - @items.each do |j| - # Do we need a better heuristic ? - if j.is_ancestor_of(i) - i.cacheditem.index = j.index - i.cacheditem.updated = true - updateditems.push(i) - found = true - # let's put j in front of itemstemp - @itemstemp.delete(j) - @itemstemp.unshift(i.cacheditem) - break - end + # If we didn't find exact match, try to check if we have an update + if j.is_ancestor_of(i) + i.cacheditem.index = j.index + i.cacheditem.updated = true + updateditems.push(i) + found = true + # let's put j in front of itemstemp + @itemstemp.delete(j) + @itemstemp.unshift(i.cacheditem) + break end end next if found diff --git a/lib/feed2imap/config.rb b/lib/feed2imap/config.rb index 53e6543..f478296 100644 --- a/lib/feed2imap/config.rb +++ b/lib/feed2imap/config.rb @@ -23,6 +23,7 @@ require 'feed2imap/maildir' require 'etc' require 'socket' +require 'set' # Default cache file DEFCACHE = ENV['HOME'] + '/.feed2imap.cache' @@ -33,7 +34,7 @@ # Feed2imap configuration class F2IConfig - attr_reader :imap_accounts, :cache, :feeds, :dumpdir, :updateddebug, :max_failures, :include_images, :default_email, :hostname + attr_reader :imap_accounts, :cache, :feeds, :dumpdir, :updateddebug, :max_failures, :include_images, :default_email, :hostname, :reupload_if_updated, :parts, :timeout # Load the configuration from the IO stream # TODO should do some sanity check on the data read. @@ -44,10 +45,25 @@ @conf['feeds'] ||= [] @feeds = [] @max_failures = (@conf['max-failures'] || 10).to_i - @updateddebug = (@conf['debug-updated'] and @conf['debug-updated'] != 'false') - @include_images = (@conf['include-images'] and @conf['include-images'] != 'false') + + @updateddebug = false + @updateddebug = @conf['debug-updated'] if @conf.has_key?('debug-updated') + + @parts = %w(text html) + @parts = Array(@conf['parts']) if @conf.has_key?('parts') && !@conf['parts'].empty? + @parts = Set.new(@parts) + + @include_images = true + @include_images = @conf['include-images'] if @conf.has_key?('include-images') + @parts << 'html' if @include_images && ! @parts.include?('html') + + @reupload_if_updated = true + @reupload_if_updated = @conf['reupload-if-updated'] if @conf.has_key?('reupload-if-updated') + + @timeout = if @conf['timeout'] == nil then 30 else @conf['timeout'].to_i end + @default_email = (@conf['default-email'] || "#{LOGNAME}@#{HOSTNAME}") - ImapAccount.no_ssl_verify = (@conf['disable-ssl-verification'] and @conf['disable-ssl-verification'] != 'false') + ImapAccount.no_ssl_verify = (@conf.has_key?('disable-ssl-verification') and @conf['disable-ssl-verification'] == true) @hostname = HOSTNAME # FIXME: should this be configurable as well? @imap_accounts = ImapAccounts::new maildir_account = MaildirAccount::new @@ -55,10 +71,11 @@ if f['disable'].nil? uri = URI::parse(f['target'].to_s) path = URI::unescape(uri.path) - path = path[1..-1] if path[0,1] == '/' if uri.scheme == 'maildir' @feeds.push(ConfigFeed::new(f, maildir_account, path, self)) else + # remove leading slash from IMAP mailbox names + path = path[1..-1] if path[0,1] == '/' @feeds.push(ConfigFeed::new(f, @imap_accounts.add_account(uri), path, self)) end end @@ -94,30 +111,55 @@ # A configured feed. simple data container. class ConfigFeed - attr_reader :name, :url, :imapaccount, :folder, :always_new, :execurl, :filter, :ignore_hash, :dumpdir, :wrapto, :include_images + attr_reader :name, :url, :imapaccount, :folder, :always_new, :execurl, :filter, :ignore_hash, :dumpdir, :wrapto, :include_images, :reupload_if_updated attr_accessor :body def initialize(f, imapaccount, folder, f2iconfig) @name = f['name'] @url = f['url'] @url.sub!(/^feed:/, '') if @url =~ /^feed:/ - @imapaccount, @folder = imapaccount, folder + @imapaccount = imapaccount + @folder = encode_utf7 folder @freq = f['min-frequency'] - @always_new = (f['always-new'] and f['always-new'] != 'false') + + @always_new = false + @always_new = f['always-new'] if f.has_key?('always-new') + @execurl = f['execurl'] @filter = f['filter'] - @ignore_hash = f['ignore-hash'] || false + + @ignore_hash = false + @ignore_hash = f['ignore-hash'] if f.has_key?('ignore-hash') + @freq = @freq.to_i if @freq @dumpdir = f['dumpdir'] || nil @wrapto = if f['wrapto'] == nil then 72 else f['wrapto'].to_i end + @include_images = f2iconfig.include_images - if f['include-images'] - @include_images = (f['include-images'] != 'false') - end + @include_images = f['include-images'] if f.has_key?('include-images') + + @reupload_if_updated = f2iconfig.reupload_if_updated + @reupload_if_updated = f['reupload-if-updated'] if f.has_key?('reupload-if-updated') + end def needfetch(lastcheck) return true if @freq.nil? return (lastcheck + @freq * 3600) < Time::now end + + def encode_utf7(s) + if "foo".respond_to?(:force_encoding) + return Net::IMAP::encode_utf7 s + else + # this is a copy of the Net::IMAP::encode_utf7 w/o the force_encoding + return s.gsub(/(&)|([^\x20-\x7e]+)/u) { + if $1 + "&-" + else + base64 = [$&.unpack("U*").pack("n*")].pack("m") + "&" + base64.delete("=\n").tr("/", ",") + "-" + end } + end + end end diff --git a/lib/feed2imap/feed2imap.rb b/lib/feed2imap/feed2imap.rb index fb4c794..f1bc9cd 100644 --- a/lib/feed2imap/feed2imap.rb +++ b/lib/feed2imap/feed2imap.rb @@ -121,7 +121,9 @@ end fetch_start = Time::now if feed.url - s = HTTPFetcher::fetch(feed.url, @cache.get_last_check(feed.name)) + fetcher = HTTPFetcher::new + fetcher::timeout = @config.timeout + s = fetcher::fetch(feed.url, @cache.get_last_check(feed.name)) elsif feed.execurl # avoid running more than one command at the same time. # We need it because the called command might not be @@ -221,7 +223,7 @@ next end begin - feed = FeedParser::Feed::new(f.body) + feed = FeedParser::Feed::new(f.body.force_encoding('UTF-8')) rescue Exception n = @cache.parse_failed(f.name) m = "Error while parsing #{f.name}: #{$!} (failed #{n} times)" @@ -247,7 +249,7 @@ id = "<#{fn}-#{i.cacheditem.index}@#{@config.hostname}>" email = item_to_mail(@config, i, id, true, f.name, f.include_images, f.wrapto) f.imapaccount.updatemail(f.folder, email, - id, i.date || Time::new) + id, i.date || Time::new, f.reupload_if_updated) end # reverse is needed to upload older items first (fixes gna#8986) newitems.reverse.each do |i| diff --git a/lib/feed2imap/httpfetcher.rb b/lib/feed2imap/httpfetcher.rb index 2438994..6734465 100644 --- a/lib/feed2imap/httpfetcher.rb +++ b/lib/feed2imap/httpfetcher.rb @@ -17,6 +17,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA =end +require 'zlib' require 'net/http' # get openssl if available begin @@ -33,7 +34,14 @@ # Class used to retrieve the feed over HTTP class HTTPFetcher - def HTTPFetcher::fetcher(baseuri, uri, lastcheck, recursion) + + @timeout = 30 # should be enough for everybody... + + def timeout=(value) + @timeout = value + end + + def fetcher(baseuri, uri, lastcheck, recursion) proxy_host = nil proxy_port = nil proxy_user = nil @@ -49,8 +57,8 @@ proxy_port, proxy_user, proxy_pass ).new(uri.host, uri.port) - http.read_timeout = 30 # should be enough for everybody... - http.open_timeout = 30 + http.read_timeout = @timeout + http.open_timeout = @timeout if uri.scheme == 'https' http.use_ssl = true http.verify_mode = OpenSSL::SSL::VERIFY_NONE @@ -61,11 +69,14 @@ useragent = 'Feed2Imap http://home.gna.org/feed2imap/' end - if lastcheck == Time::at(0) - req = Net::HTTP::Get::new(uri.request_uri, {'User-Agent' => useragent }) - else - req = Net::HTTP::Get::new(uri.request_uri, {'User-Agent' => useragent, 'If-Modified-Since' => lastcheck.httpdate}) + headers = { + 'User-Agent' => useragent, + 'Accept-Encoding' => 'gzip', + } + if lastcheck != Time::at(0) + headers.merge!('If-Modified-Since' => lastcheck.httpdate) end + req = Net::HTTP::Get::new(uri.request_uri, headers) if uri.userinfo login, pw = uri.userinfo.split(':') req.basic_auth(login, pw) @@ -81,7 +92,12 @@ end case response when Net::HTTPSuccess - return response.body + case response['Content-Encoding'] + when 'gzip' + return Zlib::GzipReader.new(StringIO.new(response.body)).read + else + return response.body + end when Net::HTTPRedirection # if not modified if Net::HTTPNotModified === response @@ -99,8 +115,8 @@ end end - def HTTPFetcher::fetch(url, lastcheck) + def fetch(url, lastcheck) uri = URI::parse(url) - return HTTPFetcher::fetcher(uri, uri, lastcheck, MAXREDIR) + return fetcher(uri, uri, lastcheck, MAXREDIR) end end diff --git a/lib/feed2imap/imap.rb b/lib/feed2imap/imap.rb index c4e7106..4c54456 100644 --- a/lib/feed2imap/imap.rb +++ b/lib/feed2imap/imap.rb @@ -108,7 +108,7 @@ end # update a mail - def updatemail(folder, mail, id, date = Time::now) + def updatemail(folder, mail, id, date = Time::now, reupload_if_updated = true) create_folder_if_not_exists(folder) @connection.select(folder) searchres = @connection.search(['HEADER', 'Message-Id', id]) @@ -119,6 +119,9 @@ searchres.each { |m| @connection.store(m, "+FLAGS", [:Deleted]) } @connection.expunge flags -= [ :Recent ] # avoids errors with dovecot + elsif not reupload_if_updated + # mail not present, and we don't want to re-upload it + return end @connection.append(folder, mail.gsub(/\n/, "\r\n"), flags, date) end @@ -141,7 +144,9 @@ d = f[0].attr['INTERNALDATE'] s = f[0].attr['ENVELOPE'].subject if s =~ /^=\?utf-8\?b\?/ - s = Base64::decode64(s.gsub(/^=\?utf-8\?b\?(.*)\?=$/, '\1')).toISO_8859_1('utf-8') + s = Base64::decode64(s.gsub(/^=\?utf-8\?b\?(.*)\?=$/, '\1')).force_encoding('utf-8') + elsif s =~ /^=\?iso-8859-1\?b\?/ + s = Base64::decode64(s.gsub(/^=\?iso-8859-1\?b\?(.*)\?=$/, '\1')).force_encoding('iso-8859-1').encode('utf-8') end if dryrun puts "To remove: #{s} (#{d})" diff --git a/lib/feed2imap/itemtomail.rb b/lib/feed2imap/itemtomail.rb index 6c70cbd..fcf8f8b 100644 --- a/lib/feed2imap/itemtomail.rb +++ b/lib/feed2imap/itemtomail.rb @@ -77,14 +77,20 @@ message.header['Subject'] = subj end end - textpart = RMail::Message::new - textpart.header['Content-Type'] = 'text/plain; charset=utf-8; format=flowed' - textpart.header['Content-Transfer-Encoding'] = '8bit' - textpart.body = item.to_text(true, wrapto, false) - htmlpart = RMail::Message::new - htmlpart.header['Content-Type'] = 'text/html; charset=utf-8' - htmlpart.header['Content-Transfer-Encoding'] = '8bit' - htmlpart.body = item.to_html + textpart = htmlpart = nil + parts = config.parts + if parts.include?('text') + textpart = parts.size == 1 ? message : RMail::Message::new + textpart.header['Content-Type'] = 'text/plain; charset=utf-8; format=flowed' + textpart.header['Content-Transfer-Encoding'] = '8bit' + textpart.body = item.to_text(true, wrapto, false) + end + if parts.include?('html') + htmlpart = parts.size == 1 ? message : RMail::Message::new + htmlpart.header['Content-Type'] = 'text/html; charset=utf-8' + htmlpart.header['Content-Transfer-Encoding'] = '8bit' + htmlpart.body = item.to_html + end # inline images as attachments imgs = [] @@ -127,7 +133,7 @@ imgs.each do |i| message.add_part(i) end - else + elsif parts.size != 1 message.header['Content-Type'] = 'multipart/alternative' message.add_part(textpart) message.add_part(htmlpart) diff --git a/lib/feed2imap/maildir.rb b/lib/feed2imap/maildir.rb index a18e5aa..8a5ade1 100644 --- a/lib/feed2imap/maildir.rb +++ b/lib/feed2imap/maildir.rb @@ -19,9 +19,13 @@ require 'uri' require 'fileutils' require 'fcntl' +require 'rmail' +require 'socket' class MaildirAccount MYHOSTNAME = Socket.gethostname + + @@seq_num = 0 attr_reader :uri @@ -31,7 +35,7 @@ end end - def updatemail(folder, mail, idx, date = Time::now) + def updatemail(folder, mail, idx, date = Time::now, reupload_if_updated = true) dir = folder_dir(folder) guarantee_maildir(dir) mail_files = find_mails(dir, idx) @@ -40,6 +44,9 @@ # get the info from the first result and delete everything info = maildir_file_info(mail_files[0]) mail_files.each { |f| File.delete(File.join(dir, f)) } + elsif not reupload_if_updated + # mail not present, and we don't want to re-upload it + return end store_message(dir, date, info) { |f| f.puts(mail) } end @@ -62,9 +69,10 @@ next if (not flags.index('S') or flags.index('F') or mtime > recent_time) - File.open(fn) do |f| - mail = RMail::Parser.read(f) + mail = File.open(fn) do |f| + RMail::Parser.read(f) end + subject = mail.header['Subject'] if dryrun puts "To remove: #{subject} #{mtime}" else @@ -84,7 +92,6 @@ end def store_message(dir, date, info, &block) - # TODO: handle `date' guarantee_maildir(dir) @@ -93,7 +100,7 @@ timer = 30 fd = nil while timer >= 0 - new_fn = new_maildir_basefn + new_fn = new_maildir_basefn(date) tmp_path = File.join(dir, 'tmp', new_fn) new_path = File.join(dir, 'new', new_fn) begin @@ -137,9 +144,10 @@ Dir[File.join(subdir, '*')].each do |fn| File.open(fn) do |f| mail = RMail::Parser.read(f) - cache_index = mail.header['Message-Id'] - next if not (cache_index and cache_index == idx) - dir_paths.push(File.join(d, File.basename(fn))) + cache_index = mail.header['Message-ID'] + if cache_index && (cache_index == idx || cache_index == "<#{idx}>") + dir_paths.push(File.join(d, File.basename(fn))) + end end end end @@ -157,14 +165,25 @@ basename = File.basename(file) colon = basename.rindex(':') - return (colon and basename.slice(colon + 1, -1)) + return (colon and basename[colon + 1 .. -1]) end - # Shamelessly taken from + # Re-written and no longer shamelessly taken from # http://gitorious.org/sup/mainline/blobs/master/lib/sup/maildir.rb - def new_maildir_basefn - Kernel::srand() - "#{Time.now.to_i.to_s}.#{$$}#{Kernel.rand(1000000)}.#{MYHOSTNAME}" + def new_maildir_basefn(date) + fn = "#{date.to_i.to_s}.#{@@seq_num.to_s}.#{MYHOSTNAME}" + @@seq_num += 1 + fn end + + def maildir_file_info_flags(fn) + parts = fn.split(',') + if parts.size == 1 + '' + else + parts.last + end + end + end diff --git a/lib/feed2imap/rexml_patch.rb b/lib/feed2imap/rexml_patch.rb index f991090..7016127 100644 --- a/lib/feed2imap/rexml_patch.rb +++ b/lib/feed2imap/rexml_patch.rb @@ -26,7 +26,7 @@ module REXML module Encoding def decode(str) - return str.toUTF8(@encoding) + return str.encode(@encoding) end def encode(str) diff --git a/test/maildir/cur/1376317520.15784_1.debian:2,S b/test/maildir/cur/1376317520.15784_1.debian:2,S new file mode 100644 index 0000000..fc6aaab --- /dev/null +++ b/test/maildir/cur/1376317520.15784_1.debian:2,S @@ -0,0 +1,11 @@ +Date: Mon, 12 Aug 2013 16:25:20 +0200 +From: Antonio Terceiro +To: terceiro@debian.org +Subject: UTF-8 data: =?iso-8859-1?B?4ent8/o=?= +Message-ID: +MIME-Version: 1.0 +Content-Type: text/plain; charset=us-ascii +Content-Disposition: inline +User-Agent: Mutt/1.5.21 (2010-09-15) + +This is a sample email diff --git a/test/maildir/cur/1376319137.17850_1.debian:2, b/test/maildir/cur/1376319137.17850_1.debian:2, new file mode 100644 index 0000000..0bff46a --- /dev/null +++ b/test/maildir/cur/1376319137.17850_1.debian:2, @@ -0,0 +1,11 @@ +Date: Mon, 12 Aug 2013 16:52:17 +0200 +From: Antonio Terceiro +To: terceiro@debian.org +Subject: an unread message +Message-ID: +MIME-Version: 1.0 +Content-Type: text/plain; charset=us-ascii +Content-Disposition: inline +User-Agent: Mutt/1.5.21 (2010-09-15) + +This message was not read yet diff --git a/test/maildir/cur/1376320022.18396_5.debian:2,FS b/test/maildir/cur/1376320022.18396_5.debian:2,FS new file mode 100644 index 0000000..2547416 --- /dev/null +++ b/test/maildir/cur/1376320022.18396_5.debian:2,FS @@ -0,0 +1,11 @@ +Date: Mon, 12 Aug 2013 17:07:02 +0200 +From: Antonio Terceiro +To: terceiro@debian.org +Subject: a flagged message +Message-ID: +MIME-Version: 1.0 +Content-Type: text/plain; charset=us-ascii +Content-Disposition: inline +User-Agent: Mutt/1.5.21 (2010-09-15) + +This message is flagged. diff --git a/test/maildir/new/1376320099.18396_7.debian b/test/maildir/new/1376320099.18396_7.debian new file mode 100644 index 0000000..ba54ddd --- /dev/null +++ b/test/maildir/new/1376320099.18396_7.debian @@ -0,0 +1,11 @@ +Date: Mon, 12 Aug 2013 17:08:19 +0200 +From: Antonio Terceiro +To: terceiro@debian.org +Subject: a new message +Message-ID: +MIME-Version: 1.0 +Content-Type: text/plain; charset=us-ascii +Content-Disposition: inline +User-Agent: Mutt/1.5.21 (2010-09-15) + +This message is new diff --git a/test/tc_config.rb b/test/tc_config.rb index ce910e9..ad4877c 100755 --- a/test/tc_config.rb +++ b/test/tc_config.rb @@ -34,6 +34,17 @@ url: http://something2 target: imaps://login:pasword@ezaezae/Feeds/B EOF +CONFPARTS = <