|
|
|
|
@ -3,125 +3,86 @@
|
|
|
|
|
"""
|
|
|
|
|
Python module to derive a key from a monoalphabetically encrypted file.
|
|
|
|
|
"""
|
|
|
|
|
import itertools as it
|
|
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
from collections import OrderedDict
|
|
|
|
|
from collections import Counter
|
|
|
|
|
from string import ascii_lowercase
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Breaker():
    """
    Derive the key of a monoalphabetically encrypted text.

    The class bundles two groups of functionality:

    * static helpers for frequency analysis and for locating word-list
      entries in a ciphertext by character position;
    * an instance-level search (`get_key') that repeatedly picks a partly
      known word from the word list, turns it into a regular expression
      and learns new letters from the best match in the ciphertext.

    Instance attributes:
      ciphertext, text -- the ciphertext handed to the constructor
      word_file        -- path of the word list file
      words            -- lines of the word list file (newline removed)
      alph             -- OrderedDict produced by `derive_alphabet_freq'
      key              -- dict mapping plaintext char -> ciphertext char
    """

    ## frequency analysis

    # Lowercase letters in the order used for frequency matching; the
    # first entry is paired with the most common ciphertext character.
    EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz")

    @staticmethod
    def get_frequency(text):
        """
        Count the occurrences of every character in `text'.

        Return a Counter that is guaranteed to contain an entry for each
        lowercase ASCII letter (count 0 if the letter does not occur).
        Characters other than lowercase letters are counted as found.
        """
        freq = Counter(text)
        # Adding a count of 0 creates the missing letter keys without
        # touching the counts that are already present.
        freq.update(dict.fromkeys(ascii_lowercase, 0))
        return freq

    @staticmethod
    def derive_alphabet_freq(freq: Counter):
        """
        Pair `EN_LETTER_FREQ' with the characters of `freq', most common
        first.

        Return an OrderedDict mapping each entry of `EN_LETTER_FREQ' to the
        character of equal frequency rank in `freq'.
        """
        ranked = [char for char, _ in freq.most_common()]
        return OrderedDict(zip(Breaker.EN_LETTER_FREQ, ranked))

    ## pattern matching

    @staticmethod
    def get_word_containing(word_file, char_list: list):
        """
        Find the first word of a word list file (one word per line)
        containing all the chars in `char_list'.

        Return TUPLE(word, pos) where `pos' is a LIST holding, for every
        char of `char_list', the index of its first occurrence in `word'.
        Return TUPLE(None, None) if no word matches.
        """
        with open(word_file, 'r') as f:
            for line in f:
                # Only strip the newline: slicing off the last character
                # would truncate a final line lacking a trailing newline.
                word = line.rstrip('\n')
                pos = [word.find(char) for char in char_list]
                if -1 not in pos:
                    return word, pos
        return None, None

    @staticmethod
    def positions(text: str, sub):
        """
        Yield every index at which `sub' occurs in `text' (overlapping
        occurrences included), in ascending order.
        """
        index = text.find(sub)
        while index != -1:
            yield index
            index = text.find(sub, index + 1)

    @staticmethod
    def match_ciphertext_at(text: str, word_pos: tuple, char: tuple):
        """
        Position-based matcher.  It used to be defined under the name
        `match_ciphertext' and was shadowed by the instance method of the
        same name; it is kept under a distinct name so it stays reachable.

        `word_pos' is a TUPLE(word, pos) as returned by
        `get_word_containing'.  For every occurrence of `char' in `text'
        the snippet is cut out that would hold `word' if `char' sat at
        index pos[0] of the word.

        Return the snippet occurring most often in `text', or None if no
        snippet could be cut out.
        """
        word, wposl = word_pos
        if word is None or not wposl:
            return None
        wpos = wposl[0]
        wlen = len(word)

        snip_count = Counter()
        for pos in Breaker.positions(text, char):
            word_begin = pos - wpos
            # A snippet must lie completely inside the text; a negative
            # start would otherwise be read as an index from the end.
            if word_begin < 0 or word_begin + wlen > len(text):
                continue
            snippet = text[word_begin:word_begin + wlen]
            if snippet not in snip_count:
                snip_count[snippet] = text.count(snippet)

        if not snip_count:
            return None
        return snip_count.most_common(1)[0][0]

    @staticmethod
    def choose_known_letters(key_alphabet):
        """
        Yield candidate LISTs of known letters taken from the keys of
        `key_alphabet'.

        With fewer than three keys the list of all keys is yielded once,
        otherwise every ordered selection of three distinct keys is
        yielded.
        """
        letters = list(key_alphabet.keys())
        if len(letters) < 3:
            yield letters
        else:
            for triple in it.permutations(letters, 3):
                yield list(triple)

    def __init__(self, ciphertext, word_file):
        """
        Read the word list and seed the key with the most frequent
        ciphertext character.

        `ciphertext' is the text to analyse, `word_file' the path of a word
        list with one word per line.

        Raises IndexError if `ciphertext' is empty and OSError if
        `word_file' cannot be read.
        """
        # NOTE(review): every character is counted, including blanks and
        # punctuation -- confirm the ciphertext is stripped beforehand.
        most_freq_cipher = Counter(ciphertext).most_common(1)[0][0]

        with open(word_file, 'r') as wf:
            # remove the trailing newline only (see get_word_containing)
            words = [line.rstrip('\n') for line in wf]

        self.ciphertext = ciphertext
        self.text = ciphertext
        self.word_file = word_file
        self.words = words
        self.alph = self.derive_alphabet_freq(self.get_frequency(ciphertext))
        self.key = { Breaker.EN_LETTER_FREQ[0] : most_freq_cipher }

    def choose_word(self, skip=()):
        """
        Pick the next word of `self.words' worth matching.

        Empty words, words listed in `skip', words whose chars are all
        known already and words with less than 30 % known chars are
        passed over.

        Return the first remaining word or None if there is none.
        """
        known_chars = self.key.keys()
        for word in self.words:
            if not word or word in skip:
                continue
            # count known chars in word
            n = sum(1 for char in word if char in known_chars)
            # skip known words or words with too many unknown
            if n == len(word) or n / len(word) < 0.3:
                continue
            return word
        return None

    def translate_and_regex(self, word: str):
        """
        Translate `word' into a pattern for the ciphertext: known chars
        are replaced by their ciphertext char, unknown chars by ".".

        The pattern has exactly one character per character of `word'.
        """
        # NOTE(review): ciphertext chars are inserted unescaped; a regex
        # metacharacter in the key would alter the pattern.
        return "".join(self.key.get(char, ".") for char in word)

    def match_ciphertext(self, regex):
        """
        Return the most frequent match of `regex' in `self.text', or None
        if the pattern does not match at all.
        """
        rx = re.compile(regex)
        count = Counter(rx.findall(self.text))
        if not count:
            return None
        return count.most_common(1)[0][0]

    def extract_unknown(self, plain, regex, cipher):
        """
        Learn new key entries from a matched word.

        `plain' is the plaintext word, `regex' the pattern built by
        `translate_and_regex' and `cipher' the matching ciphertext snippet.
        For every "." in `regex' the plaintext char is mapped to the
        ciphertext char found at the same index.

        Return the updated key.  Raises ValueError if the three arguments
        differ in length.
        """
        if not (len(plain) == len(regex) == len(cipher)):
            raise ValueError("plain, regex and cipher must have equal length")

        for plain_char, rx_char, cipher_char in zip(plain, regex, cipher):
            if rx_char == ".":
                self.key[plain_char] = cipher_char

        return self.key

    def key_to_str(self):
        """
        Return the string form of the plaintext chars known so far.
        """
        # NOTE(review): this is the repr of the dict's key view and leaves
        # out the ciphertext chars -- confirm the intended key format.
        return str(self.key.keys())

    def get_key(self):
        """
        Extend `self.key' word by word until 26 chars are known or no
        usable word is left, printing each learning step.

        Return the result of `key_to_str'.
        """
        tried = set()   # words used already; prevents an endless loop
        while len(self.key) < 26:
            word = self.choose_word(skip=tried)
            if word is None:
                break
            tried.add(word)

            regex = self.translate_and_regex(word)
            cipher = self.match_ciphertext(regex)
            # no match, or a match that cannot be aligned char by char
            if cipher is None or len(cipher) != len(word):
                continue

            print(word, regex, cipher, self.extract_unknown(word, regex, cipher))

        return self.key_to_str()

## end Breaker
|
|
|
|
|
|