import urllib
import re
import time
import textwrap
import random
import aphorisms

from HTMLParser import HTMLParser
from pratchett  import get_pratchett_quotes
from aphorisms  import get_aphorismsgalore_sigs

def max_line_length(quote):
  """Returns the number of characters in the longest line of quote.

  Lines are delimited by "\\n" only; an empty quote gives 0.
  """
  longest = 0
  for line in quote.split("\n"):
    line_length = len(line)
    if line_length > longest:
      longest = line_length
  return longest


def attrib_line( quote, attrib ):
  """Returns attrib with some indentation before it for prettiness.

  The indent is a quarter of the length of quote's longest line, capped
  so that the attribution does not run past column 80, but never less
  than 4 spaces (the minimum wins if the two limits conflict).
  """
  min_indent = 4
  max_indent = 80 - len(attrib)
  # Floor division: the indent must stay an integer even where "/" means
  # true division, otherwise " " * indent below would fail.
  indent = min( max_line_length(quote) // 4, max_indent )
  indent = max( indent, min_indent )
  return " " * indent + attrib


def read_typos():
  """Reads a list of typos from sig_typos.txt.

  sig_typos.txt is a list of "**"-separated pairs, which are
  whitespace-stripped at either end.  Comment lines start "#".
  Blank lines, and lines which do not split into exactly two parts,
  are silently skipped.

  Returns a list of (typo, fixed) tuples in file order.  Any error
  from opening the file (in the current directory) is not caught.
  """
  typos = []
  typo_file = open( "sig_typos.txt" )
  try:
    for line in typo_file:
      line = line.strip()
      if not line or line.startswith("#"): continue
      bits = line.split("**")
      if len(bits) != 2: continue
      typos.append( (bits[0].strip(), bits[1].strip()) )
  finally:
    # Close explicitly rather than relying on garbage collection.
    typo_file.close()
  return typos


def fix_typos( text ):
  """Returns text with every correction from read_typos() applied in order.

  The typo list is re-read on every call.  A typo starting with "!" is
  treated as a regular expression (without the "!"); any other typo is
  matched literally.  Either way the replacement is handed to re.sub(),
  so it is interpreted as a replacement template (backslash escapes and
  group references apply).
  """
  assert isinstance(text, str), "fix_typos expects a str"

  for typo, fixed in read_typos():
    if typo.startswith("!"): # regexp
      pattern = typo[1:]
    else:
      pattern = re.escape(typo)
    # An empty pattern (from a line like "** foo" or "! ** foo") matches
    # at every position and would splice `fixed` between all characters.
    if not pattern: continue
    text = re.sub( pattern, fixed, text )

  return text


def get_themanwhofellasleep_quotes():
  """Downloads a list of quotes from themanwhofellasleep.com.

  Each page listed in urls is fetched, the numbered quotes are pulled
  out with a regexp, corrected with fix_typos(), and re-formatted to
  look like a sig: ellipses normalised, text wrapped to 77 columns with
  continuation lines indented by one space, the whole thing wrapped in
  quotation marks and followed by an attribution line naming the URL.

  Returns a list of strings, one per quote.  Download errors raised by
  urllib are not caught.
  """

  urls = ( "http://themanwhofellasleep.com/gossip.html",
           "http://themanwhofellasleep.com/gossip2.html",
         )

  quote_re = re.compile( r">(?:[1-9]|10)\.\s*((?:[^<\s]+\s+)*[^<\s]+)\s*</font" )
    # This returns "blah blah blah" from ">3. blah blah blah </font".
    # It's not perfect, but it's close enough.

  short_ellipsis_re = re.compile( r"(?<=[^.])\.\.(?=\s*[A-Za-z])" ) # pair of dots
  long_ellipsis_re = re.compile( r"\.\.\.\.+" ) # four or more dots
  ellipsis_lc_re = re.compile( r"\.\.\.\ *(?=[a-z])" ) # ("...",any " "s);lowercase
  ellipsis_uc_re = re.compile( r"\.\.\.\ +(?=[^a-z])" ) # ("...",one or more " "s);not lowercase

  # Download and parse the webpages listed in urls.
  quotes = []
  for url in urls:
    # Download the webpage, closing the connection even if read() fails.
    connection = urllib.urlopen(url)
    try:
      webpage = connection.read()
    finally:
      connection.close()
    #webpage = open( url.split("/")[-1] ).read() # debug

    # Extract the quotes and fix their typos.
    for quote in quote_re.findall(webpage):
      quote = fix_typos(quote)

      # Re-format them to look like a sig...
      quote = short_ellipsis_re.sub( "...", quote ) # fix "blah.. blah"
      quote = long_ellipsis_re.sub( "...", quote ) # fix "blah.... blah"
      quote = "... ".join(quote.split("...")) # ensure ellipses end sentences.

      # Quotation marks with which to surround whole quote: switch to
      # single quotes if the quote itself contains double quotes.
      if "&quot;" in quote:
        qs = "'"
        quote = quote.replace( "&quot;", '"' )
      else:
        qs = '"'

      quote = " ".join(quote.split()) # turn whitespace into single-spaces.
      quote = textwrap.fill(quote, 77, fix_sentence_endings=1) # paragraph formatting.
      quote = "\n ".join(quote.split("\n")) # Indent non-first lines by one space.
      quote = ellipsis_lc_re.sub( "... ", quote ) # single space after mid-sentence "..."
      quote = ellipsis_uc_re.sub( "...  ", quote ) # double space after end-of-sentence "..."

      # ... and add them to the list of quotes.
      quotes.append( qs + quote + qs + '\n' + attrib_line(quote,'-- ' + url) )

  #for q in quotes: print ">>"+q+"<<"
  return quotes


def run_infinite_loop_serving_quotes( quotes, interval=5 ):
  """Loop forever, writing one random sig to .signature every interval seconds.

  quotes   -- non-empty sequence of quote strings (random.choice raises
              IndexError on an empty one).
  interval -- seconds to sleep between rewrites; defaults to 5.

  Never returns.
  """

  while True:
    # We fix the typos again in case the typos have changed.
    # If they have, we might need to re-validate the quote in theory,
    # but in practice, if I've spotted a typo, the correction will
    # probably only improve it.  If that typo occurs again... I don't care.
    quote = random.choice(quotes)

    # Build the full text before opening the file: opening for writing
    # truncates it, so a failure in fix_typos() must happen first or the
    # signature would be left empty.
    signature = "Tom\n-- \n" + fix_typos(quote) + "\n"

    sig_file = open( ".signature", "w" )
    try:
      sig_file.write( signature )
    finally:
      sig_file.close()

    time.sleep(interval)


def validate_quotes( quotes ):
  """Prints a warning for every distinct quote that is too long or too wide.

  A quote is "long" if it has 5 or more lines and "wide" if any of its
  lines is longer than 79 characters.  At most one warning is printed
  per quote (length is checked first), and a quote that appears several
  times in quotes is only reported once.  Nothing is returned and the
  list is not modified.
  """
  seen = set()   # already-handled quotes, avoids duplicate warnings
  for quote in quotes:
    if quote in seen: continue
    seen.add( quote )
    lines = quote.split("\n")
    if len(lines) >= 5:
      print( "warning, long quote: (%d lines)" % len(lines) )
      print( quote )
      continue
    # The quote is already split, so measure the longest line directly.
    width = max([ len(line) for line in lines ])
    if width > 79:
      print( "warning, wide quote: (%d chars)" % width )
      print( quote )


def _balancing_repetitions( target_len, source_len ):
  """Returns how many copies of a source_len-item list roughly match target_len items.

  The result is always at least 1, so a source is never wiped out by
  being multiplied by zero, and an empty source does not cause a
  division by zero.
  """
  if source_len <= 0: return 1
  return max( 1, target_len // source_len )


def main():
  """Gathers quotes from every source, balances them, and serves them forever.

  Prints progress and a summary, prints warnings for over-long or
  over-wide quotes, then hands over to
  run_infinite_loop_serving_quotes(), which does not return.
  """
  # Build the list of all the quotes

  theman_quotes    = get_themanwhofellasleep_quotes()
  pratchett_quotes = get_pratchett_quotes()

  print( "Getting Aphorisms Galore quotes slowly..." )
  aphorisms.use_slowdown = True
  aphorisms.print_progress = True
  ag_sigs = get_aphorismsgalore_sigs()
  print( "done." )
  print( "" )

  # Tweak the probabilities a bit by repeating the Pratchett and
  # Aphorisms Galore quotes until each set is roughly as large as the
  # TheManWhoFellAsleep set.  New lists are built rather than extending
  # the lists the other modules returned.

  pq_orig_len = len(pratchett_quotes)
  pq_repetitions = _balancing_repetitions( len(theman_quotes), pq_orig_len )
  pratchett_quotes = pratchett_quotes * pq_repetitions

  ag_orig_len = len(ag_sigs)
  ag_repetitions = _balancing_repetitions( len(theman_quotes), ag_orig_len )
  ag_sigs = ag_sigs * ag_repetitions

  print ( "Serving %d quotes from TheManWhoFellAsleep, "
          "%d(*%d=%d) from Terry Pratchett/Misc.,\n"
          "%d(*%d=%d) from `Aphorisms Galore!'"
              % (  len(theman_quotes),
                   pq_orig_len, pq_repetitions, len(pratchett_quotes),
                   ag_orig_len, ag_repetitions, len(ag_sigs),
                ))

  quotes = theman_quotes + pratchett_quotes + ag_sigs

  # Check that there are no quotes which are too long etc.
  validate_quotes( quotes )

  # Start serving.
  run_infinite_loop_serving_quotes( quotes )


# Only fetch and serve quotes when run as a script, not when imported.
if __name__=="__main__":
  main()

