""" stringtools.py
    c odenthal
    2005/05/30

    This module contains some string functions that
    are meant to be useful in cryptanalysis.
"""

from __future__ import division

import string

# ASCII alphabets exposed as module-level constants.
UPPERCASE = string.ascii_uppercase
LOWERCASE = string.ascii_lowercase
# Both cases together; stripPunc keeps only characters found in LETTERS.
LETTERS = string.ascii_letters

def stripPunc(strng):
    """ Strips all the non-letters out of a string.

        Only characters found in LETTERS (the ASCII letters of
        both cases) are kept; their order is preserved. A new
        string is always returned.
    """
    # Join explicitly instead of relying on filter(): on Python 3
    # filter() yields a lazy iterator rather than a string, which
    # breaks callers that slice or count the result.
    return "".join([c for c in strng if c in LETTERS])

def stripDups(strng):
    """ Strips all repetitions out of a string.

        Each distinct character is kept once, at the position of
        its first occurrence; later occurrences are dropped.
    """
    # Remember what has been emitted in a set so each membership
    # test is constant time, instead of rescanning the prefix of
    # the string for every character.
    seen = set()
    kept = []
    for c in strng:
        if c not in seen:
            seen.add(c)
            kept.append(c)
    return "".join(kept)

def letterSet(strng):
    """ Returns a 'set' holding each distinct character of 'strng'.
    """
    letters = set()
    letters.update(strng)
    return letters

def digraphSet(strng):
    """ Returns a 'set' holding every distinct digraph (pair of
        adjacent characters) found in 'strng'.
    """
    followers = strng[1:]
    return set(first + second
               for first, second in zip(strng, followers))

def trigraphSet(strng):
    """ Returns a 'set' holding every distinct trigraph (run of
        three adjacent characters) found in 'strng'.
    """
    seconds = strng[1:]
    thirds = strng[2:]
    return set(a + b + c
               for a, b, c in zip(strng, seconds, thirds))

def freq2Perc(dct):
    """ Given a dictionary whose values are frequencies, returns
        a new dictionary with the same keys whose values are the
        corresponding percentages of the total frequency.
    """
    total = sum(dct.values())
    return dict((key, 100*count/total)
                for key, count in dct.items())

def freq2Prob(dct):
    """ Given a dictionary whose values are frequencies, returns
        a new dictionary with the same keys whose values are the
        corresponding fractions of the total frequency.
    """
    total = sum(dct.values())
    return dict((key, count/total)
                for key, count in dct.items())

def countOccur(strng,stSet):
    """ Counts the number of times the items (strings) in the
        set 'stSet' occur in the string 'strng'. A dictionary
        - with keys equal to the items in 'stSet' and values
        equal to the counts - is returned.

        Overlapping occurrences are all counted, so 'AA' occurs
        twice in 'AAA' and 'RTR' occurs twice in 'RTRTR'.
    """
    cntDct = {}
    for sub in stSet:
        # str.count() skips past each match and therefore misses
        # overlapping occurrences; restart the search one position
        # after the start of every hit instead.
        hits = 0
        pos = strng.find(sub)
        while pos != -1:
            hits += 1
            pos = strng.find(sub, pos + 1)
        cntDct[sub] = hits
    return cntDct

def letterFreq(strng):
    """ Returns a dictionary mapping each letter occurring in
        'strng' to the number of times it occurs. The string is
        uppercased and stripped of non-letters before counting.
    """
    cleaned = stripPunc(strng.upper())
    return countOccur(cleaned, letterSet(cleaned))

def digraphFreq(strng):
    """ Constructs a dictionary with digraphs as keys and
        frequencies of those digraphs in strng as values.
        The string is converted to uppercase before the
        count is done; also, non alphabetic characters are
        stripped out of the string before the count is done.
        So, for example, the string 'Rt. R T' would have
        digraph 'RT' occurring twice and the digraph 'TR'
        occurring once.

        Every pair of adjacent letters is counted, including
        overlapping ones: 'AAA' has digraph 'AA' occurring twice.
    """
    strng = stripPunc(strng.upper())
    # Tally each adjacent pair directly. Counting with str.count()
    # would skip overlapping occurrences such as the second 'AA'
    # in 'AAA'.
    freqDct = {}
    for pair in zip(strng, strng[1:]):
        di = "".join(pair)
        freqDct[di] = freqDct.get(di, 0) + 1
    return freqDct

def trigraphFreq(strng):
    """ Constructs a dictionary with trigraphs as keys and
        frequencies of those trigraphs in strng as values.
        The string is converted to uppercase and non alphabetic
        characters are stripped out of it before the count is
        done. So, for example, the string 'Rt. R Tr' would have
        trigraph 'RTR' occurring twice and trigraph 'TRT'
        occurring once.
    """
    strng = stripPunc(strng.upper())
    # Tally each run of three adjacent letters directly. Counting
    # with str.count() would skip overlapping occurrences and report
    # 'RTR' only once in 'RTRTR'.
    freqDct = {}
    for triple in zip(strng, strng[1:], strng[2:]):
        tri = "".join(triple)
        freqDct[tri] = freqDct.get(tri, 0) + 1
    return freqDct

def letterPerc(strng):
    """ Returns a dictionary mapping each letter occurring in
        'strng' to its percentage of the total letter count,
        based on the counts produced by letterFreq.
    """
    counts = letterFreq(strng)
    return freq2Perc(counts)

def digraphPerc(strng):
    """ Returns a dictionary mapping each digraph occurring in
        'strng' to its percentage of the total digraph count,
        based on the counts produced by digraphFreq. Non
        alphabetic characters are stripped out before counting,
        so the string 'S T' contains the digraph 'ST'.
    """
    counts = digraphFreq(strng)
    return freq2Perc(counts)

def trigraphPerc(strng):
    """ Returns a dictionary mapping each trigraph occurring in
        'strng' to its percentage of the total trigraph count,
        based on the counts produced by trigraphFreq. Non
        alphabetic characters are stripped out before counting,
        so the string 'ST. R' contains the trigraph 'STR'.
    """
    counts = trigraphFreq(strng)
    return freq2Perc(counts)

def letterProb(strng):
    """ Returns a dictionary mapping each letter occurring in
        'strng' to its share (between 0 and 1) of the total
        letter count, based on the counts produced by letterFreq.
    """
    counts = letterFreq(strng)
    return freq2Prob(counts)

def digraphProb(strng):
    """ Returns a dictionary mapping each digraph occurring in
        'strng' to its share (between 0 and 1) of the total
        digraph count, based on the counts produced by
        digraphFreq. Non alphabetic characters are stripped out
        before counting, so the string 'S T' contains the
        digraph 'ST'.
    """
    counts = digraphFreq(strng)
    return freq2Prob(counts)

def trigraphProb(strng):
    """ Returns a dictionary mapping each trigraph occurring in
        'strng' to its share (between 0 and 1) of the total
        trigraph count, based on the counts produced by
        trigraphFreq. Non alphabetic characters are stripped out
        before counting, so the string 'ST. R' contains the
        trigraph 'STR'.
    """
    counts = trigraphFreq(strng)
    return freq2Prob(counts)

def breakIntoWords(strng,lngth=5):
    """ Breaks a string into a list of words, each of length 'lngth'.

        The final word is shorter when the length of 'strng' is
        not a multiple of 'lngth'. An empty string gives an empty
        list.

        Raises ValueError if 'lngth' is less than 1.
    """
    # A non-positive length never consumes the string, so the
    # original slicing loop would never terminate; reject it up front.
    if lngth < 1:
        raise ValueError("lngth must be a positive integer")
    return [strng[k:k + lngth] for k in range(0, len(strng), lngth)]

def breakIntoCols(strng,nmbr=5):
    """ Breaks a string into a list of words obtained
        by listing the string 'in depth'. The words
        are the columns of the depth tableau. There
        are 'nmbr' columns.

        The string is written out in rows of 'nmbr' characters
        and the columns are read off. Each column is passed
        through stripPunc, so every non-letter character - not
        only the padding added here - is removed from the result.
        A list of strings is returned.
    """
    # Add some spaces so the tableau comes out even.
    shrt = len(strng) % nmbr
    if shrt:
        strng += " "*(nmbr - shrt)

    # Break into words and then transpose into columns.
    colLst = transpose(breakIntoWords(strng,nmbr))

    # Strip out the extra spaces we put in before. Build a real
    # list: map() only yields a lazy iterator on Python 3.
    return [stripPunc(col) for col in colLst]

def transpose(strngLst):
    """ Takes a list of strings and 'transposes' them.
        This returns another list of strings, the first
        string consists of the first character from each
        string from the original list, etc. The strings
        in the input list must all be of the same length;
        if they are not, the result is cut off at the
        length of the shortest string.
    """
    # Argument unpacking replaces apply(), which no longer exists
    # on Python 3.
    return ["".join(charTup) for charTup in zip(*strngLst)]