Frequency analysis (Vigenère)

Somehow less involved than the frequency analysis of a substitution cipher because it involves using frequency analysis to look for a keyword directly rather than for each letter in the cipher. Once the keyword is known, then determining the rest of the key is straightforward.

import modules/libraries
import string from collections import Counter from collections import OrderedDict import pandas as pd import numpy as np import re import matplotlib.pyplot as plt %matplotlib inline import itertools as itertools from itertools import izip from itertools import cycle

import ciphertext (from Simon Singh’s book)

ciphertext_raw = "WUBEFIQLZURMVOFEHMYMWT",\ "IXCGTMPIFKRZUPMVOIRQMM",\ "WOZMPULMBNYVQQQMVMVJLE",\ "YMHFEFNZPSDLPPSDLPEVQM",\ "WCXYMDAVQEEFIQCAYTQOWC",\ "XYMWMSEMEFCFWYEYQETRLI",\ "QYCGMTWCWFBSMYFPLRXTQY",\ "EEXMRULUKSGWFPTLRQAERL",\ "UVPMVYQYCXTWFQLMTELSFJ",\ "PQEHMOZCIWCIWFPZSLMAEZ",\ "IQVLQMZVPPXAWCSMZMORVG",\ "VVQSZETRLQZPBJAZVQIYXE",\ "WWOICCGDWHQMMVOWSGNTJP",\ "FPPAYBIYBJUTWRLQKLLLMD",\ "PYVACDCFQNZPIFPPKSDVPT",\ "IDQXMQQVEBMQALKEZMGCVK",\ "UZKIZBZLIUAMMVZ"

ciphertext = '' for letter in ciphertext_raw: ciphertext = ciphertext + letter print ciphertext

gives:

WUBEFIQLZURMVOFEHMYMWTIXCGTMPIFKRZUPMVOIRQ

MMWOZMPULMBNYVQQQMVMVJLEYMHFEFNZPSDLPPSDL

PEVQMWCXYMDAVQEEFIQCAYTQOWCXYMWMSEMEFCFWY

EYQETRLIQYCGMTWCWFBSMYFPLRXTQYEEXMRULUKSGWF

PTLRQAERLUVPMVYQYCXTWFQLMTELSFJPQEHMOZCIWCIW

FPZSLMAEZIQVLQMZVPPXAWCSMZMORVGVVQSZETRLQZPB

JAZVQIYXEWWOICCGDWHQMMVOWSGNTJPFPPAYBIYBJUTW

RLQKLLLMDPYVACDCFQNZPIFPPKSDVPTIDQXMQQVEBMQAL

KEZMGCVKUZKIZBZLIUAMMVZ

choose a minimum of four repeated characters and scroll through the ciphertext looking for any repeats

matches = [] i = 0 while i < len(ciphertext)-4: pattern = ciphertext[i:(i+4)] match = re.findall(pattern,ciphertext) matches.append(match) i += 1

identify the repeated units in the ciphertext.
first_pass = [] for match in matches: if len(match) > 1: first_pass.append(match) second_pass = [] for item in first_pass: second_pass.append(item[0]) second_pass = set(second_pass) second_pass = list(second_pass)

if there are strings that share more than two letters and they are next to one another then these represent strings that repeated which where longer than four letters in length. Identify.

third_pass = [] to_remove = [] i = 0 j = 0 while i < len(second_pass): j = 0 while j < len(second_pass): if i == j: j += 1 else: a = second_pass[i][0:3] b = second_pass[j][1:4] if a != b: j += 1 else: n = second_pass[i] to_remove.append(n) n = str(n) m = second_pass[j] to_remove.append(m) m = str(m[0]) pattern = m+n third_pass.append(pattern) j += 1 i += 1 for comb in to_remove: if comb in second_pass: second_pass.remove(comb) for comb in third_pass: second_pass.append(comb) final_combinations = second_pass print "repeated combinations found: ",final_combinations

gives:

repeated combinations found: [‘ETRL’, ‘EFIQ’, ‘PSDLP’, ‘WCXYM’]

Determine repeat spacing in ciphertext
spacing_dict = {} for comb in final_combinations: index = 0 word_length = len(comb) spacing = [] while index < len(ciphertext): ciphertext_slice= ciphertext[index:index+word_length] if ciphertext_slice == comb: spacing.append(index) index += 1 else: index += 1 spacing_dict[comb] = spacing

determine differences between the repeat spacing

for k,v in spacing_dict.iteritems(): spacings = v i = 0 j = 1 differences = [] while j<len(spacings): a = spacings[i] b = spacings[j] diff = b - a differences.append(diff) i += 1 j += 1 spacing_dict[k] = differences print "repeat spacing of the different combinations are: ",spacing_dict

gives:

repeat spacing of the different combinations are: {‘ETRL’: [120], ‘EFIQ’: [95], ‘PSDLP’: [5], ‘WCXYM’: [20]}

#this very nice algorithm is taken from http://stackoverflow.com/questions/15347174/python-finding-prime-factors. I was looking for something simipler and more elegant than my approach and this fits the bill nicely, posted by Stefan

def prime_factors(n): i = 2 factors = [] while i * i <= n: if n % i: i += 1 else: n //= i factors.append(i) if n > 1: factors.append(n) return factors

determine factors for each of the each of the patterns

for k,v in spacing_dict.iteritems(): factor_list= [] for spacing in v: factors = prime_factors(spacing) factor_list.append(factors) spacing_dict[k] = factor_list print spacing_dict

find the common factor between the different sets:
consolidated_factors = [] for k,v in spacing_dict.iteritems(): i = 0 while i < len(v): factors = v[i] consolidated_factors.append(factors) i += 1 print 'factors for each of the spacing patterns: ',consolidated_factors

gives:

{‘ETRL’: [[2, 2, 2, 3, 5]], ‘EFIQ’: [[5, 19]], ‘PSDLP’: [[5]], ‘WCXYM’: [[2, 2, 5]]}
factors for each of the spacing patterns: [[2, 2, 2, 3, 5], [5, 19], [5], [2, 2, 5]]

from the factors, find the common factor amongst them – this is likely to be the key word length

number_lists = len(consolidated_factors) remaining = [] a = consolidated_factors[0] temp = set(a) i = 1 while i < len(consolidated_factors)-1: b = consolidated_factors[i+1] temp = temp & set(b) i+=1 factor = list(temp) factor = factor[0] print "word length of key is likely to be: ",factor

gives:

word length of key is likely to be: 5

prepare dataframe from frequency analysis comparison. Frequency are taken from Fig.15 p.73 “The Code Book”, Simon Singh

letters = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z'] frequencies = [6,1,2,3,9,2,1,5,5,0,1,3,2,5,6,1,0,4,5,7,2,1,2,0,1,0] x_pos = np.arange(len(letters)) plt.bar(x_pos,frequencies) plt.xticks(x_pos, letters) plt.show()

Get frequencies of letters corresponding to each of the five letters in the key for each, maximise overlap with plaintext dataframe. offset = key_letter.

strings = {} i = 0 while i < factor: j=i strings_temp = [] while j < len(ciphertext)-factor: letter = ciphertext[j] strings_temp.append(letter) j = j + factor strings[i] = strings_temp i += 1 freq_tables = {num: {chr(num+97):0 for num in range(0,26,1)} for num in range(0,factor,1)}

compile the frequencies of the letters at each of word spacings throughout the ciphertext
cipher_freq_table = {} i=0 while i < factor: string_n = sorted(strings[i]) string_n = str(string_n) string_n = string_n.lower() string_n = string_n.replace(' ','') string_n = string_n.replace('[','') string_n = string_n.replace(']','') string_n = string_n.replace(',','') string_n = string_n.replace("'",'') cipher_freq = (Counter(string_n)) cipher_freq_table[i] = cipher_freq i+=1 i=0 while i < factor: for k,v in cipher_freq_table[i].iteritems(): freq_tables[i][k] = v i += 1

determine their frequencies as a %
total_letters = float(len(ciphertext)/5) i=0 while i < factor: letters_list = [] frequency_list = [] for k,v in freq_tables[i].iteritems(): floated = float(v) percented = (floated/total_letters)*100 rounded = round(percented,1) freq_tables[i][k] = rounded i += 1 print freq_tables

for ease of reference, compile into a pandas dataaframe

dict_list = [v for v in freq_tables.iteritems()] simple_dict_list = [] i = 0 while i < factor: simple_dict_list.append(dict_list[i][1]) i += 1 df = pd.DataFrame(simple_dict_list) print df.head()

a b c d e f g h i j … q r s \
0 0.0 0.0 2.7 0.0 8.2 2.7 4.1 0.0 11.0 2.7 … 1.4 4.1 9.6
1 4.1 1.4 0.0 6.8 1.4 12.3 2.7 2.7 2.7 0.0 … 20.5 0.0 0.0
2 4.1 5.5 2.7 1.4 4.1 0.0 0.0 0.0 4.1 1.4 … 6.8 1.4 0.0
3 1.4 0.0 4.1 2.7 11.0 1.4 1.4 2.7 0.0 1.4 … 1.4 1.4 2.7
4 2.7 4.1 12.3 0.0 4.1 6.8 1.4 0.0 4.1 1.4 … 8.2 8.2 2.7

t u v w x y z
0 1.4 0.0 11.0 8.2 6.8 4.1 0.0
1 5.5 6.8 0.0 0.0 1.4 6.8 8.2
2 4.1 1.4 13.7 6.8 1.4 0.0 2.7
3 5.5 0.0 2.7 6.8 2.7 8.2 9.6
4 1.4 5.5 0.0 4.1 0.0 4.1 2.7

[5 rows x 26 columns]

add the normal frequencies to the dataframe
frequencies_float = [] for frequency in frequencies: frequency = float(frequency) frequencies_float.append(frequency) letters_list = [letter for letter in string.ascii_lowercase] df2 = pd.DataFrame(frequencies_float,index=letters_list) df2.columns df2.columns = ['norm'] df2 = df2.T df3 = pd.concat([df,df2], axis=0) df3 = df3.T

shifts_dict = {} j = 0 while j < factor: norm = df3['norm'] norm_array = np.array(norm) s1 = df3[j] diff2_list = [] #shift_list = [] i = 0 while i < 25: a = s1[:i] b = s1[i:] c = pd.concat([b,a]) c_array = np.array(c) diff = norm_array - c_array diff2 = diff**2 diff2_sum = diff2.sum() diff2_list.append(diff2_sum) i += 1 most_overlap = min(diff2_list) index = 0 for item in diff2_list: if item == most_overlap: break else: index += 1 #print index a = s1[:index] b = s1[index:] viginere_shift = pd.concat([b,a]) shifts_dict[j] = viginere_shift j += 1

because data is stored as pandas series, and we want to find the label of the data, rather than the data point itself. we need to reference it by its value, in this case the topmost data value for each series.

top_values = [] for k,v in shifts_dict.iteritems(): top = v[0] top_values.append(top)

key_list = [] i = 0 while i < factor: series1 = shifts_dict[i] key_letter = series1[series1==top_values[i]].index key_letter = key_letter[0] key_list.append(key_letter) i += 1 print "the key is likely to be: ",key_list

gives:

the key is likely to be: [‘e’, ‘m’, ‘i’, ‘l’, ‘y’]

to decipher, import the key and prepare the ciphertext

ciphertext = ciphertext.lower() ciphertext = ciphertext.replace(" ", "") ciphertext = list(ciphertext) ciphertext_list = [] for letter in ciphertext: ciphertext_list.append(letter)

create the Vigenere square, to encipher and decipher

square = {(num+1):(num+1) for num in range(0,26,1)} alphalist = [chr(num+97) for num in range(0,26,1)] for k,v in square.iteritems(): linelista = alphalist[v:] linelistb = alphalist[:v] linelistc = [] for letter in linelista: linelistc.append(letter) for letter in linelistb: linelistc.append(letter) square[k] = linelistc linelista = [] linelistb = [] linelistc = [] for k,v in square.iteritems(): new_key = square[k][0] square[new_key] = square.pop(k)

estbalish the cipher alphabets for each letter in the ciphertext, using the key

alphalist_match = [] for item in itertools.izip(ciphertext_list, itertools.cycle(key_list)): alphalist_match.append(item)

choose the correct plaintext letter for each ciphertext letter

alphalist_return_index = zip(alphalist,range(0,25,1)) alphalist_return_index = dict(alphalist_return_index) for k,v in alphalist_return_index.iteritems(): new_key = v alphalist_return_index[new_key] = alphalist_return_index.pop(k) alphalist_return_index[new_key] = k

perform the substitution

plaintext = '' for item in alphalist_match: a,b = item for index,letter in enumerate(square[b]): if letter==a: return_index = index plaintext_letter = alphalist_return_index[return_index] plaintext = plaintext+plaintext_letter print "plaintext is: ",plaintext

gives:

plaintext is: sittheedownandhavenoshamecheekbyjowlandkneebyknee

whatcareiforanynamewhatfororderordegreeletmescrewtheeupapeglet

meloosethytonguewithwinecallestthouthatthingalegwhichisthinnestthine

orminethoushaltnotbesavedbyworksthouhastbeenasinnertooruinedtrunkson

witheredforksemptyscarecrowsiandyoufillthecupandfillthecanhavearouse

beforethemorneverimomentdiesamaneverymomentoneisborn

Frequency analysis (Vigenère)

Share this: