Note

Gmail now supports search operators for size ranges; see my blog post about them. (2012-11-14)

After I posted about using Google's Python XOAuth library to get the unread mail count and list, I finally found a good reason to use IMAP: you can search based on message size, which you can't do in the web interface.

typ, data = imap_conn.search(None, '(SMALLER %d) (LARGER %d)' % (MAXSIZE * 1000, MINSIZE * 1000))

That is great, but not awesome, because Gmail's IMAP server does not support the SORT command, which, according to the Python documentation, is an IMAP4rev1 extension command.

The entire source code is similar to the one in my previous post:

#!/usr/bin/env python
# Copyright 2010 Yu-Jie Lin
# BSD license

import email
import email.header
import imaplib
import sys

# http://google-mail-xoauth-tools.googlecode.com/svn/trunk/python/xoauth.py
import xoauth

# OAuth scope requested when generating the request token.
scope = 'https://mail.google.com/'
# OAuth consumer identity shared by every xoauth call in this script.
consumer = xoauth.OAuthEntity('anonymous', 'anonymous')
# Host that main() opens the IMAP-over-SSL connection to.
imap_hostname = 'imap.googlemail.com'

# Upper bound on how many matching messages have their headers fetched and
# listed; the total match count is still reported in full.
MAX_FETCH = 20


# config.py is written by main() on the first run and holds `user` and
# `access_token`.  When it does not exist yet, fall back to an empty
# attribute holder so main() can detect the missing settings with hasattr()
# and fill them in.
try:
  import config
except ImportError:
  class Config():
    pass
  config = Config()


def get_access_token():

  request_token = xoauth.GenerateRequestToken(
      consumer, scope, nonce=None, timestamp=None,
      google_accounts_url_generator=config.google_accounts_url_generator
      )

  oauth_verifier = raw_input('Enter verification code: ').strip()
  try:
    access_token = xoauth.GetAccessToken(
        consumer, request_token, oauth_verifier, config.google_accounts_url_generator)
  except ValueError:
    # Could indicate failure of authentication because verifier is incorrect
    print 'Incorrect verification code?'
    sys.exit(1)
  return access_token


def main():

  # Checking user email and access token
  if not hasattr(config, 'user') or not hasattr(config, 'access_token'):
    config.user = raw_input('Please enter your email address: ')
    config.google_accounts_url_generator = xoauth.GoogleAccountsUrlGenerator(config.user)
    access_token = get_access_token()
    config.access_token = {'key': access_token.key, 'secret': access_token.secret}
    # XXX save token, this is not a good way, I'm too lazy to use something
    # like shelve.
    f = open('config.py', 'w')
    f.write('user = %s\n' % repr(config.user))
    f.write('access_token = %s\n' % repr(config.access_token))
    f.close()
    print '\n\nconfig.py written.\n\n'

  config.google_accounts_url_generator = xoauth.GoogleAccountsUrlGenerator(config.user)
  access_token = xoauth.OAuthEntity(config.access_token['key'], config.access_token['secret'])

  # Generate xoauth string
  class ImBad():
    # I'm bad because I'm going to shut xoauth's mouth up. So you won't see these debug messages:
    # signature base string:
    # GET&https%3A%2F%2Fmail.google.com%2Fmail%2Fb%2Flivibetter%40gmail.com%...
    #
    # xoauth string (before base64-encoding):
    # GET https://mail.google.com/mail/b/livibettergmail.com/IMAP/ oauth_co...
    def write(self, msg): pass
  sys.stdout = ImBad()
  xoauth_string = xoauth.GenerateXOauthString(
      consumer, access_token, config.user, 'IMAP',
      xoauth_requestor_id=None, nonce=None, timestamp=None)
  sys.stdout = sys.__stdout__

  MINSIZE = int(raw_input('Larger than in KB [1000]? ') or 1000)
  MAXSIZE = int(raw_input('Smaller than in KB [5000]? ') or 5000)
  if MAXSIZE < MINSIZE:
    print >> sys.stderr, 'Wrong size range!'
    sys.exit(1)
  print
  imap_conn = imaplib.IMAP4_SSL(imap_hostname)
  imap_conn.authenticate('XOAUTH', lambda x: xoauth_string)
  imap_conn.select('[Gmail]/All Mail', readonly=True)
  typ, data = imap_conn.search(None, '(SMALLER %d) (LARGER %d)' % (MAXSIZE * 1000, MINSIZE * 1000))
  # No SORT command on Gmail IMAP server
  #typ, data = imap_conn.sort('(REVERSE SIZE)', 'UTF-8', '(LARGER %d)' % SIZE)
  unreads = data[0].split()
  print '%d messages are between %d and %d KB.' % (len(unreads), MINSIZE, MAXSIZE)
  ids = ','.join(unreads[:MAX_FETCH])
  if ids:
    print
    print 'Listing %d messages:' % min(len(unreads), MAX_FETCH)
    typ, data = imap_conn.fetch(ids, '(RFC822.HEADER)')
    for item in data:
      if isinstance(item, tuple):
        raw_msg = item[1]
        msg = email.message_from_string(raw_msg)
        # Some email's header are encoded, for example: '=?UTF-8?B?...'
        print '\033[1;35m%s\033[0m: \033[1;32m%s\033[0m' % (
            email.header.decode_header(msg['from'])[0][0],
            email.header.decode_header(msg['subject'])[0][0],
            )
  imap_conn.close()
  imap_conn.logout()


# Run the interactive search only when executed as a script, not on import.
if __name__ == '__main__':
  main()

The output would look like:

% python2.5 ./gmail-xoauth-find-large.py
Larger than in KB [1000]?
Smaller than in KB [5000]?

23 messages are between 1000 and 5000 KB.

Listing 20 messages:
[messages here]

The search can take quite a long time to complete, up to several minutes, so please be patient.

I wanted to find those big emails because I couldn't figure out why 9,085 emails take up 543 MB in my Gmail account. The biggest mail is 15,189 KB, which is 2.80% of the used space. The second and third biggest are 9,366 KB and 7,659 KB; together they take 3.14%.