#!/usr/bin/python2.2
###
# haiku - finds accidental haiku in free text
###
# $Id: haiku,v 1.2 2002/03/11 04:09:01 danny Exp $
###
# 
# Based on idea (and, sadly, lost implementation) by Don Marti.
# Requires the c06d file from  http://www.speech.cs.cmu.edu/cgi-bin/cmudict
#
# This code lives at http://www.oblomovka.com/code/ - please check there
# for full documentation, and latest versions.
#
#     Copyright 2002 Danny O'Brien <danny@spesh.com>
# 
#     This program is free software; you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation; either version 2 of the License, or
#     (at your option) any later version.
# 
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
# 
#     You should be able to view the GNU General Public License at
#     http://www.gnu.org/copyleft/gpl.html ; if not, write to the Free Software
#     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#     "One person gains one
#     dollar by destroying two
#     dollars' worth of wealth."

from __future__ import generators
import sys, os, re, fileinput, getopt

def get_sy(word):
    """Return the syllable count held in a (word, syllables) tuple"""
    syllables = word[1]
    return syllables

def get_word(word):
    """Return the word text held in a (word, syllables) tuple"""
    text = word[0]
    return text
    
sy_dict = {}    # memo: raw token -> syllable count (0 when not recognised)
def get_sy_count(word):
    """Return the number of syllables in word, 0 if word not recognised.

    The token is upper-cased and stripped of everything except A-Z and
    apostrophes, then looked up by running /bin/egrep over the file named
    by the module-level `cmudict`.  The syllable count is the number of
    digit runs on the first matching line.

    Every answer -- including 0 for tokens that are empty after stripping
    or that the dictionary does not list -- is stored in sy_dict under the
    raw token, so each distinct token costs at most one egrep process.
    """
    if word in sy_dict:     # memoize as we go
        return sy_dict[word]
    sy_count = 0
    uword = re.sub(r"[^A-Z']", '', word.upper())
    if uword:
        pipe = os.popen('/bin/egrep "^%s " %s' % (uword, cmudict))
        try:
            line = pipe.readline()  # only the first matching entry is used
        finally:
            pipe.close()
        cdu = re.split(r'\s+', line, 1)
        # No output from egrep gives cdu == [''], which never equals uword.
        if cdu[0] == uword:
            sy_count = len(re.findall(r"\d+", cdu[1]))
            if sy_count == 0:
                # entry exists but carries no digits to count
                print("Couldn't find %s" % uword)
    sy_dict[word] = sy_count    # cache misses too, not just hits
    return sy_count
    
def generate_sy_file():
    """Generate (word, syllable count) tuples for the input text.

    Input is read line by line through fileinput.input().  Each line is
    split on runs of whitespace and every token is passed to get_sy_count;
    tokens it scores as 0 syllables are dropped.  The word in each tuple is
    the token exactly as it was split off the line."""
    for line in fileinput.input():
        for token in re.split(r'\s+', line):
            count = get_sy_count(token)
            if count:
                yield (token, count)
   
def pp_haiku(list):
    """Print a haiku in human-readable form, one haiku line per output line,
    followed by a blank line.

    list holds one entry per haiku line; each entry is itself a list of
    (word, syllables) tuples.  Nothing is printed when the '-s' option is
    set and the last word does not end in '.', or when the '-c' option is
    set and the first word does not start with a capital A-Z."""
    # Did we *want* it to end with a full stop?  If so, does it?
    if '-s' in options and get_word(list[-1][-1])[-1] != '.':
        return
    # Same question for the leading capital letter.
    if '-c' in options and not re.match('[A-Z]', get_word(list[0][0])[0]):
        return
    for line in list:
        print(" ".join([get_word(pair) for pair in line]))
    print("")

prog_name = re.sub(".*/", "", sys.argv[0])  # script name without its directory
cmudict = './c06d'                          # dictionary file get_sy_count searches
version = 0.02

usage = """haiku %s
usage: %s [ -c ] [ -s ] [ -h ] file...

Finds haiku in text files. Needs the %s file from 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict

-c  List only haiku that begin with a capital letter
-s  List only haiku that end with a fullstop
-h  Show this message and exit (also --help)

""" % (version, prog_name, cmudict)

# Parse the command line.  A bad option is a usage error: say what was wrong
# and show the usage text on stderr, then exit with a non-zero status so
# calling scripts can tell the run failed.
try:
    opts, newargv = getopt.getopt(sys.argv[1:], 'chs', ['help'])
except getopt.GetoptError:
    sys.stderr.write("%s: %s\n" % (prog_name, sys.exc_info()[1]))
    sys.stderr.write(usage + "\n")
    raise SystemExit(2)

# Leave only the non-option arguments in sys.argv; generate_sy_file calls
# fileinput.input() with no arguments, which takes its file names from there.
sys.argv = sys.argv[0:1] + newargv
options = dict(opts)    # option flag -> its argument ('' for plain flags)

# Asking for help is not an error: usage goes to stdout, exit status is 0.
if ('-h' in options or '--help' in options):
    print(usage)
    raise SystemExit

haiku_form = ( 5, 7, 5 )     # syllables required on each haiku line, in order

sy_file = generate_sy_file() # iterator yielding (word, syllables) tuples
window = []                  # tuples read from sy_file and not yet discarded
i = 0                        # position in window of the next tuple to use

full_haiku = []              # completed lines of the haiku being built

haiku_index = 0              # which entry of haiku_form is being filled

# Each pass of the outer loop tries to build one haiku line starting at
# window[i].  The script ends via SystemExit once the input is exhausted.
while 1:
    haiku_line = []
    sy_count = 0
    sy_max = haiku_form[haiku_index]
    while (sy_count < sy_max):  # take words until the line is full
        if (i >= len(window)):  # window used up: pull in one more word
            try:
                window.append(sy_file.next())
            except StopIteration:
                break           # out of input; the line stays short
        pair = window[i]
        sy_count += get_sy(pair)
        haiku_line.append(pair)
        i += 1

    if (sy_count == sy_max):    # exact fit: keep the line
        full_haiku.append(haiku_line)
        haiku_index += 1
        if (haiku_index == len(haiku_form)):  # all lines filled: print it
            pp_haiku(full_haiku)
            full_haiku = []
            haiku_index = 0
        # i is left alone, so the next line starts right after this one
    else:
        # Overshot or ran out of input: abandon the attempt, drop the first
        # word of the window and start again from the word after it.
        full_haiku = []
        haiku_index = 0
        i = 0
        while 1:
            if (not window):    # nothing left to drop: input is finished
                raise SystemExit
            del window[0]
            if (not window):
                try:
                    window.append(sy_file.next())
                except StopIteration:
                    raise SystemExit
            # with -c, keep dropping words until one starts with a capital
            skip = ('-c' in options) and not re.match('[A-Z]', get_word(window[0]))
            if (not skip):
                break
