Somehow less involved than the frequency analysis of a substitution cipher because it involves using frequency analysis to look for a keyword directly rather than for each letter in the cipher. Once the keyword is known, then determining the rest of the key is straightforward.
import modules/libraries
import string
from collections import Counter
from collections import OrderedDict
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline
import itertools as itertools
from itertools import izip
from itertools import cycle
import ciphertext (from Simon Singh’s book)
ciphertext_raw = "WUBEFIQLZURMVOFEHMYMWT",\
"IXCGTMPIFKRZUPMVOIRQMM",\
"WOZMPULMBNYVQQQMVMVJLE",\
"YMHFEFNZPSDLPPSDLPEVQM",\
"WCXYMDAVQEEFIQCAYTQOWC",\
"XYMWMSEMEFCFWYEYQETRLI",\
"QYCGMTWCWFBSMYFPLRXTQY",\
"EEXMRULUKSGWFPTLRQAERL",\
"UVPMVYQYCXTWFQLMTELSFJ",\
"PQEHMOZCIWCIWFPZSLMAEZ",\
"IQVLQMZVPPXAWCSMZMORVG",\
"VVQSZETRLQZPBJAZVQIYXE",\
"WWOICCGDWHQMMVOWSGNTJP",\
"FPPAYBIYBJUTWRLQKLLLMD",\
"PYVACDCFQNZPIFPPKSDVPT",\
"IDQXMQQVEBMQALKEZMGCVK",\
"UZKIZBZLIUAMMVZ"
ciphertext = ''
for letter in ciphertext_raw:
ciphertext = ciphertext + letter
print ciphertext
gives:
WUBEFIQLZURMVOFEHMYMWTIXCGTMPIFKRZUPMVOIRQ
MMWOZMPULMBNYVQQQMVMVJLEYMHFEFNZPSDLPPSDL
PEVQMWCXYMDAVQEEFIQCAYTQOWCXYMWMSEMEFCFWY
EYQETRLIQYCGMTWCWFBSMYFPLRXTQYEEXMRULUKSGWF
PTLRQAERLUVPMVYQYCXTWFQLMTELSFJPQEHMOZCIWCIW
FPZSLMAEZIQVLQMZVPPXAWCSMZMORVGVVQSZETRLQZPB
JAZVQIYXEWWOICCGDWHQMMVOWSGNTJPFPPAYBIYBJUTW
RLQKLLLMDPYVACDCFQNZPIFPPKSDVPTIDQXMQQVEBMQAL
KEZMGCVKUZKIZBZLIUAMMVZ
choose a minimum of four repeated characters and scroll through the ciphertext looking for any repeats
matches = []
i = 0
while i < len(ciphertext)-4:
pattern = ciphertext[i:(i+4)]
match = re.findall(pattern,ciphertext)
matches.append(match)
i += 1
identify the repeated units in the ciphertext.
first_pass = []
for match in matches:
if len(match) > 1:
first_pass.append(match)
second_pass = []
for item in first_pass:
second_pass.append(item[0])
second_pass = set(second_pass)
second_pass = list(second_pass)
if there are strings that share more than two letters and they are next to one another then these represent strings that repeated which where longer than four letters in length. Identify.
third_pass = []
to_remove = []
i = 0
j = 0
while i < len(second_pass):
j = 0
while j < len(second_pass):
if i == j:
j += 1
else:
a = second_pass[i][0:3]
b = second_pass[j][1:4]
if a != b:
j += 1
else:
n = second_pass[i]
to_remove.append(n)
n = str(n)
m = second_pass[j]
to_remove.append(m)
m = str(m[0])
pattern = m+n
third_pass.append(pattern)
j += 1
i += 1
for comb in to_remove:
if comb in second_pass:
second_pass.remove(comb)
for comb in third_pass:
second_pass.append(comb)
final_combinations = second_pass
print "repeated combinations found: ",final_combinations
gives:
repeated combinations found: [‘ETRL’, ‘EFIQ’, ‘PSDLP’, ‘WCXYM’]
Determine repeat spacing in ciphertext
spacing_dict = {}
for comb in final_combinations:
index = 0
word_length = len(comb)
spacing = []
while index < len(ciphertext):
ciphertext_slice= ciphertext[index:index+word_length]
if ciphertext_slice == comb:
spacing.append(index)
index += 1
else:
index += 1
spacing_dict[comb] = spacing
determine differences between the repeat spacing
for k,v in spacing_dict.iteritems():
spacings = v
i = 0
j = 1
differences = []
while j<len(spacings):
a = spacings[i]
b = spacings[j]
diff = b - a
differences.append(diff)
i += 1
j += 1
spacing_dict[k] = differences
print "repeat spacing of the different combinations are: ",spacing_dict
gives:
repeat spacing of the different combinations are: {‘ETRL’: [120], ‘EFIQ’: [95], ‘PSDLP’: [5], ‘WCXYM’: [20]}
#this very nice algorithm is taken from http://stackoverflow.com/questions/15347174/python-finding-prime-factors. I was looking for something simipler and more elegant than my approach and this fits the bill nicely, posted by Stefan
def prime_factors(n):
i = 2
factors = []
while i * i <= n: if n % i: i += 1 else: n //= i factors.append(i) if n > 1:
factors.append(n)
return factors
determine factors for each of the each of the patterns
for k,v in spacing_dict.iteritems():
factor_list= []
for spacing in v:
factors = prime_factors(spacing)
factor_list.append(factors)
spacing_dict[k] = factor_list
print spacing_dict
find the common factor between the different sets:
consolidated_factors = []
for k,v in spacing_dict.iteritems():
i = 0
while i < len(v):
factors = v[i]
consolidated_factors.append(factors)
i += 1
print 'factors for each of the spacing patterns: ',consolidated_factors
gives:
{‘ETRL’: [[2, 2, 2, 3, 5]], ‘EFIQ’: [[5, 19]], ‘PSDLP’: [[5]], ‘WCXYM’: [[2, 2, 5]]}
factors for each of the spacing patterns: [[2, 2, 2, 3, 5], [5, 19], [5], [2, 2, 5]]
from the factors, find the common factor amongst them – this is likely to be the key word length
number_lists = len(consolidated_factors)
remaining = []
a = consolidated_factors[0]
temp = set(a)
i = 1
while i < len(consolidated_factors)-1:
b = consolidated_factors[i+1]
temp = temp & set(b)
i+=1
factor = list(temp)
factor = factor[0]
print "word length of key is likely to be: ",factor
gives:
word length of key is likely to be: 5
prepare dataframe from frequency analysis comparison. Frequency are taken from Fig.15 p.73 “The Code Book”, Simon Singh
letters = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
frequencies = [6,1,2,3,9,2,1,5,5,0,1,3,2,5,6,1,0,4,5,7,2,1,2,0,1,0]
x_pos = np.arange(len(letters))
plt.bar(x_pos,frequencies)
plt.xticks(x_pos, letters)
plt.show()

Get frequencies of letters corresponding to each of the five letters in the key for each, maximise overlap with plaintext dataframe. offset = key_letter.
strings = {}
i = 0
while i < factor:
j=i
strings_temp = []
while j < len(ciphertext)-factor:
letter = ciphertext[j]
strings_temp.append(letter)
j = j + factor
strings[i] = strings_temp
i += 1
freq_tables = {num: {chr(num+97):0 for num in range(0,26,1)} for num in range(0,factor,1)}
compile the frequencies of the letters at each of word spacings throughout the ciphertext
cipher_freq_table = {}
i=0
while i < factor:
string_n = sorted(strings[i])
string_n = str(string_n)
string_n = string_n.lower()
string_n = string_n.replace(' ','')
string_n = string_n.replace('[','')
string_n = string_n.replace(']','')
string_n = string_n.replace(',','')
string_n = string_n.replace("'",'')
cipher_freq = (Counter(string_n))
cipher_freq_table[i] = cipher_freq
i+=1
i=0
while i < factor:
for k,v in cipher_freq_table[i].iteritems():
freq_tables[i][k] = v
i += 1
determine their frequencies as a %
total_letters = float(len(ciphertext)/5)
i=0
while i < factor:
letters_list = []
frequency_list = []
for k,v in freq_tables[i].iteritems():
floated = float(v)
percented = (floated/total_letters)*100
rounded = round(percented,1)
freq_tables[i][k] = rounded
i += 1
print freq_tables
for ease of reference, compile into a pandas dataaframe
dict_list = [v for v in freq_tables.iteritems()]
simple_dict_list = []
i = 0
while i < factor:
simple_dict_list.append(dict_list[i][1])
i += 1
df = pd.DataFrame(simple_dict_list)
print df.head()
a b c d e f g h i j … q r s \
0 0.0 0.0 2.7 0.0 8.2 2.7 4.1 0.0 11.0 2.7 … 1.4 4.1 9.6
1 4.1 1.4 0.0 6.8 1.4 12.3 2.7 2.7 2.7 0.0 … 20.5 0.0 0.0
2 4.1 5.5 2.7 1.4 4.1 0.0 0.0 0.0 4.1 1.4 … 6.8 1.4 0.0
3 1.4 0.0 4.1 2.7 11.0 1.4 1.4 2.7 0.0 1.4 … 1.4 1.4 2.7
4 2.7 4.1 12.3 0.0 4.1 6.8 1.4 0.0 4.1 1.4 … 8.2 8.2 2.7
t u v w x y z
0 1.4 0.0 11.0 8.2 6.8 4.1 0.0
1 5.5 6.8 0.0 0.0 1.4 6.8 8.2
2 4.1 1.4 13.7 6.8 1.4 0.0 2.7
3 5.5 0.0 2.7 6.8 2.7 8.2 9.6
4 1.4 5.5 0.0 4.1 0.0 4.1 2.7
[5 rows x 26 columns]
add the normal frequencies to the dataframe
frequencies_float = []
for frequency in frequencies:
frequency = float(frequency)
frequencies_float.append(frequency)
letters_list = [letter for letter in string.ascii_lowercase]
df2 = pd.DataFrame(frequencies_float,index=letters_list)
df2.columns
df2.columns = ['norm']
df2 = df2.T
df3 = pd.concat([df,df2], axis=0)
df3 = df3.T
shifts_dict = {}
j = 0
while j < factor:
norm = df3['norm']
norm_array = np.array(norm)
s1 = df3[j]
diff2_list = []
#shift_list = []
i = 0
while i < 25:
a = s1[:i]
b = s1[i:]
c = pd.concat([b,a])
c_array = np.array(c)
diff = norm_array - c_array
diff2 = diff**2
diff2_sum = diff2.sum()
diff2_list.append(diff2_sum)
i += 1
most_overlap = min(diff2_list)
index = 0
for item in diff2_list:
if item == most_overlap:
break
else:
index += 1
#print index
a = s1[:index]
b = s1[index:]
viginere_shift = pd.concat([b,a])
shifts_dict[j] = viginere_shift
j += 1
because data is stored as pandas series, and we want to find the label of the data, rather than the data point itself. we need to reference it by its value, in this case the topmost data value for each series.
top_values = []
for k,v in shifts_dict.iteritems():
top = v[0]
top_values.append(top)
key_list = []
i = 0
while i < factor:
series1 = shifts_dict[i]
key_letter = series1[series1==top_values[i]].index
key_letter = key_letter[0]
key_list.append(key_letter)
i += 1
print "the key is likely to be: ",key_list
gives:
the key is likely to be: [‘e’, ‘m’, ‘i’, ‘l’, ‘y’]
to decipher, import the key and prepare the ciphertext
ciphertext = ciphertext.lower()
ciphertext = ciphertext.replace(" ", "")
ciphertext = list(ciphertext)
ciphertext_list = []
for letter in ciphertext:
ciphertext_list.append(letter)
create the Vigenere square, to encipher and decipher
square = {(num+1):(num+1) for num in range(0,26,1)}
alphalist = [chr(num+97) for num in range(0,26,1)]
for k,v in square.iteritems():
linelista = alphalist[v:]
linelistb = alphalist[:v]
linelistc = []
for letter in linelista:
linelistc.append(letter)
for letter in linelistb:
linelistc.append(letter)
square[k] = linelistc
linelista = []
linelistb = []
linelistc = []
for k,v in square.iteritems():
new_key = square[k][0]
square[new_key] = square.pop(k)
estbalish the cipher alphabets for each letter in the ciphertext, using the key
alphalist_match = []
for item in itertools.izip(ciphertext_list, itertools.cycle(key_list)):
alphalist_match.append(item)
choose the correct plaintext letter for each ciphertext letter
alphalist_return_index = zip(alphalist,range(0,25,1))
alphalist_return_index = dict(alphalist_return_index)
for k,v in alphalist_return_index.iteritems():
new_key = v
alphalist_return_index[new_key] = alphalist_return_index.pop(k)
alphalist_return_index[new_key] = k
perform the substitution
plaintext = ''
for item in alphalist_match:
a,b = item
for index,letter in enumerate(square[b]):
if letter==a:
return_index = index
plaintext_letter = alphalist_return_index[return_index]
plaintext = plaintext+plaintext_letter
print "plaintext is: ",plaintext
gives:
plaintext is: sittheedownandhavenoshamecheekbyjowlandkneebyknee
whatcareiforanynamewhatfororderordegreeletmescrewtheeupapeglet
meloosethytonguewithwinecallestthouthatthingalegwhichisthinnestthine
orminethoushaltnotbesavedbyworksthouhastbeenasinnertooruinedtrunkson
witheredforksemptyscarecrowsiandyoufillthecupandfillthecanhavearouse
beforethemorneverimomentdiesamaneverymomentoneisborn