""" stringtools.py c odenthal 2005/05/30 This module contains some string functions that are meant to be useful in cryptanalysis. """ from __future__ import division import string UPPERCASE = string.ascii_uppercase LOWERCASE = string.ascii_lowercase LETTERS = string.ascii_letters def stripPunc(strng): """ Strips all the non-letters out of a string. """ return filter(lambda c: c in LETTERS, strng) def stripDups(strng): """ Strips all repetitions out of a string. """ return "".join([ c for (k,c) in enumerate(strng) if c not in strng[:k]]) def letterSet(strng): """ Constructs a 'set' containing the letters in 'strng'. """ return set(strng) def digraphSet(strng): """ Constructs a 'set' containing the digraphs in 'strng'. """ diLst = map(lambda t:"".join(t), zip(strng,strng[1:])) return set(diLst) def trigraphSet(strng): """ Constructs a 'set' containing the trigraphs in 'strng'. """ triLst = map(lambda t:"".join(t), zip(strng,strng[1:],strng[2:])) return set(triLst) def freq2Perc(dct): """ Converts a dictionary having frequencies as values to a dictionary having percentages as values. """ N = sum(dct.values()) dP = {} for k,val in dct.items(): dP[k] = 100*val/N return dP def freq2Prob(dct): """ Converts a dictionary having frequencies as values to a dictionary having probablities as values. """ N = sum(dct.values()) dP = {} for k,val in dct.items(): dP[k] = val/N return dP def countOccur(strng,stSet): """ Counts the number of times the items (strings) in the set 'stSet' occur in the string 'strng'. A dictionary - with keys equal to the items in 'stSet' and values equal to the counts - is returned. """ cntDct = {} for sub in stSet: cntDct[sub] = strng.count(sub) return cntDct def letterFreq(strng): """ Constructs a dictionary with letters as keys and frequencies of those letters in strng as values. """ strng = stripPunc(strng.upper()) ltSet = letterSet(strng) return countOccur(strng,ltSet) def digraphFreq(strng): """ Constructs a dictionary with digraphs as keys and frequencies of those digraphs in strng as values. The string is converted to uppercase before the count is done; also, non alphabetic characters are stripped out of the string before the count is done. So, for example, so the string 'Rt. R T' would have digraph 'RT' occuring twice and the digraph 'TR' occuring once. """ strng = stripPunc(strng.upper()) diSet = digraphSet(strng) return countOccur(strng,diSet) def trigraphFreq(strng): """ Constructs a dictionary with trigraphs as keys and frequencies of those trigraphs in strng as values. Non alphabetic characters are stripped out of the string before the count is done. So, for example, the string 'Rt. R Tr' would have trigraph 'RTR' occuring twice and trigraph 'TRT' occuring once. """ strng = stripPunc(strng.upper()) triSet = trigraphSet(strng) return countOccur(strng,triSet) def letterPerc(strng): """ Constructs a dictionary with letters as keys and percentages of those letters in strng as values. """ return freq2Perc(letterFreq(strng)) def digraphPerc(strng): """ Constructs a dictionary with digraphs as keys and percentages of those digraphs in strng as values. Non alphabetic characters are stripped out of the string before the count is done. So, for example, the string 'S T' will show the digraph 'ST' as occuring once. """ return freq2Perc(digraphFreq(strng)) def trigraphPerc(strng): """ Constructs a dictionary with trigraphs as keys and frequencies of those trigraphs in strng as values. Non alphabetic characters are stripped out of the string before the count is done. So, for example, the string 'ST. R' will show the trigraph 'STR' as occuring once. """ return freq2Perc(trigraphFreq(strng)) def letterProb(strng): """ Constructs a dictionary with letters as keys and probabilities of those letters in strng as values. """ return freq2Prob(letterFreq(strng)) def digraphProb(strng): """ Constructs a dictionary with digraphs as keys and probabilities of those digraphs in strng as values. Non alphabetic characters are stripped out of the string before the count is done. So, for example, the string 'S T' will show the digraph 'ST' as occuring once. """ return freq2Prob(digraphFreq(strng)) def trigraphProb(strng): """ Constructs a dictionary with trigraphs as keys and frequencies of those trigraphs in strng as values. Non alphabetic characters are stripped out of the string before the count is done. So, for example, the string 'ST. R' will show the trigraph 'STR' as occuring once. """ return freq2Prob(trigraphFreq(strng)) def breakIntoWords(strng,lngth=5): """ Breaks a string into a list of words, each of length 'lngth' """ wLst = [] while strng: s,strng = strng[:lngth],strng[lngth:] wLst.append(s) return wLst def breakIntoCols(strng,nmbr=5): """ Breaks a string into a list of words obtained by listing the string 'in depth'. The words are the columns of the depth tableau. There are 'nmbr' columns. """ # Add some spaces so the tableau comes out even. shrt = len(strng) % nmbr if shrt: strng += " "*(nmbr - shrt) # Break into words and then transpose into columns. colLst = transpose(breakIntoWords(strng,nmbr)) # Strip out the extra spaces we put in before. return map(stripPunc,colLst) def transpose(strngLst): """ Takes a list of strings and 'transposes' them. This returns another list of strings, the first string consists of the first character from each string from the original list, etc. The strings in the input list must all be of the same length. """ charLstLst = apply(zip,strngLst) return ["".join(charTup) for charTup in charLstLst]