#!/usr/bin/python2.2
###
# haiku - finds accidental 5-7-5 syllable constructs in free text
###
# $Id: haiku,v 1.5 2002/03/15 07:00:09 danny Exp $
###
# 
# Based on idea (and, sadly, lost implementation) by Don Marti.
# Requires the c06d file from  http://www.speech.cs.cmu.edu/cgi-bin/cmudict
#
# This code lives at http://www.oblomovka.com/code/ - please check there
# for full documentation, and latest versions.
#
#     Copyright 2002 Danny O'Brien <danny@spesh.com>
# 
#     This program is free software; you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation; either version 2 of the License, or
#     (at your option) any later version.
# 
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
# 
#     You should be able to view the GNU General Public License at
#     http://www.gnu.org/copyleft/gpl.html ; if not, write to the Free Software
#     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#     "One person gains one
#     dollar by destroying two
#     dollars' worth of wealth."

from __future__ import generators
import sys, os, re, fileinput, getopt

def get_sy(word):
    """Return the syllable count held in a (text, syllables) word tuple"""
    syllables = word[1]
    return syllables

def get_word(word):
    """Return the text held in a (text, syllables) word tuple"""
    text = word[0]
    return text
    
sy_dict = {}        # memo: normalised word -> syllable count (0 = unknown)
_cmu_tables = {}    # dictionary filename -> {headword: syllable count}

# A dictionary entry is a headword made only of capitals and apostrophes,
# one space, then the pronunciation. Comment lines and alternative
# pronunciations such as "WORD(2)" do not match and are ignored.
_cmu_entry = re.compile(r"([A-Z']+) (.*)")

def _load_cmu_table(path):
    """Return a headword -> syllable count mapping for the pronouncing
    dictionary stored at path. The file is read once; later calls for the
    same path reuse the table. Raises IOError if the file cannot be read."""
    if path in _cmu_tables:
        return _cmu_tables[path]
    table = {}
    f = open(path)
    try:
        for line in f:
            entry = _cmu_entry.match(line)
            if entry is None:
                continue
            head, phones = entry.groups()
            # Keep the first entry for a headword only. Each run of digits
            # in the pronunciation is counted as one syllable.
            if head not in table:
                table[head] = len(re.findall(r"\d+", phones))
    finally:
        f.close()
    _cmu_tables[path] = table
    return table

def get_sy_count(word):
    """Return the number of syllables in word, 0 if word not recognised.

    word is upper-cased and stripped of everything except letters A-Z and
    apostrophes before being looked up in the dictionary file named by the
    module-level cmudict. Results, including misses, are memoized in
    sy_dict.

    The dictionary is read in-process rather than by running egrep through
    a shell for each word, so the dictionary filename may safely contain
    spaces or shell metacharacters. Raises IOError if the dictionary file
    cannot be read."""
    uword = re.sub(r"[^A-Z']", '', word.upper())
    if uword == '':
        return 0
    if uword in sy_dict:     # memoize as we go
        return sy_dict[uword]
    sy_count = _load_cmu_table(cmudict).get(uword, 0)
    sy_dict[uword] = sy_count
    return sy_count
    
def generate_sy_file():
    """Generator over the text supplied through fileinput: yields a
    (word, syllable count) tuple for each whitespace-separated word that
    get_sy_count recognises, silently dropping every other word"""
    for line in fileinput.input():
        for token in re.split(r'\s+', line):
            count = get_sy_count(token)
            if count != 0:
                yield (token, count)
   
def pp_haiku(list):
    """Pretty-print a list made up lists of lists of word syllable pairs as a
    human-readable verse. Refuses to print verses that don't reach the high
    standards of either the -c or -s options"""
    # does it end with a full stop? Did we *want* it to end with a fullstop?
    if ('-s' in options and not get_word(list[-1][-1])[-1] == '.'):
        return
    if ('-c' in options and not re.match('[A-Z]',get_word(list[0][0])[0])):
        return
    for a in list:
        for b in a:
            print get_word(b),
        print 
    print

prog_name = re.sub(".*/", "", sys.argv[0])
trycmu = ['/usr/local/share/c06d', '/usr/share/dict/c06d', 'c06d' ]
version = 0.03

usage = """haiku %s
usage: %s [ -c ] [ -s ] [ -h ] [ -d filename ] file...

Finds 5-7-5 syllable constructs in text files. Needs the %s file from
http://www.speech.cs.cmu.edu/cgi-bin/cmudict

-c  List only verses that begin with a capital letter
-s  List only verses that end with a fullstop
-d  Specify filename of %s dictionary
""" % (version, prog_name, trycmu[-1], trycmu[-1])
try:
   opts, newargv = getopt.getopt(sys.argv[1:], 'chsd:', ['help'])
except getopt.GetoptError:
   print usage
   raise SystemExit

sys.argv=sys.argv[0:1] + newargv
options={}
for (a,b) in opts:
    options[a]=b

if ('-h' in options or '--help' in options):
    print usage
    raise SystemExit

if ('-d' in options):
    trycmu= [ options['-d'] ] 

cmudict=''
for i in trycmu:
    if (os.path.exists(i)):
        cmudict = i
        break
if (cmudict == ''):
    raise IOError, 'Could not find %s, the Carnegie Mellon Pronunciation Dictionary' % trycmu[-1]

# Syllables required on each successive line of a verse.
haiku_form = ( 5, 7, 5 )

sy_file = generate_sy_file() # iterator for walking through file
window = []                  # words read from the file and not yet discarded;
                             # window[0] is where the current attempt starts
i = 0                        # index into window of the next word to consume

full_haiku = []              # verse composed so far

haiku_index = 0              # index into haiku_form: the line being built

# Main scan. Starting at window[0], try to fill each line of haiku_form
# with words whose syllable counts add up exactly to that line's length.
# Whether or not the attempt yields a verse, window[0] is then dropped and
# the next attempt starts one word further on, so overlapping verses are
# found too. The loop is left only through SystemExit, once the input and
# the window are both exhausted.

# build one line
while 1:                  
    haiku_line = []
    sy_count = 0
    sy_max = haiku_form[haiku_index]
    while (sy_count < sy_max): # suck up enough syllables
        try:
            sy_count = sy_count + get_sy(window[i])
        except IndexError:     # silently grab new words from file
            try:
                window.append(sy_file.next())
            except StopIteration: 
                # Input ran out mid-line: sy_count stays below sy_max, so
                # the line is rejected by the test below.
                break
            sy_count = sy_count + get_sy(window[i])
        haiku_line.append(window[i])
        i += 1

    # Only an exact fit counts; a word that overshoots the line length
    # makes this starting point fail.
    if (sy_count == sy_max):   # have line?
        full_haiku.append(haiku_line)
        haiku_index += 1
        if (haiku_index != len(haiku_form)): # more lines to go?
            continue           # next line carries on from window[i]
        pp_haiku(full_haiku) # otherwise print the stanza, and backtrack

    # backtrack to beginning, but start with next word in file
    full_haiku = []
    haiku_index = 0
    i = 0
    while 1:
        try:
            window.pop(0)      # discard the word this attempt started on
        except IndexError: 
             # window was already empty: there was nothing left to try
             raise SystemExit
        if (not window):
            # Refill so there is a word for the -c test below to look at,
            # or stop if the input is exhausted.
            try:
                window.append(sy_file.next())
            except StopIteration: 
               raise SystemExit 
        # if we're looking for capital letter verses, then skip lcase ones
        if (('-c' not in options) or (re.match('[A-Z]',window[0][0]))):
            break
