|
|
|
|
@ -2,22 +2,27 @@
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
Python module to derive a key from a monoalphabetically encrypted file.
|
|
|
|
|
Does not work yet.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
from collections import Counter
|
|
|
|
|
#from string import ascii_lowercase
|
|
|
|
|
|
|
|
|
|
import pdb
|
|
|
|
|
|
|
|
|
|
class Breaker():
|
|
|
|
|
"""
|
|
|
|
|
A handle on the various bits of data needed to derive the key from the ciphertext.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz")
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def read_word_file(word_file):
|
|
|
|
|
# excuse me
|
|
|
|
|
"""
|
|
|
|
|
Helper function to read the words file into memory. The rationale was that
|
|
|
|
|
querying would be faster and it would be possible to delete used words
|
|
|
|
|
without mutating the file.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
words = []
|
|
|
|
|
with open(word_file, 'r') as wf:
|
|
|
|
|
for line in wf:
|
|
|
|
|
@ -32,6 +37,7 @@ class Breaker():
|
|
|
|
|
|
|
|
|
|
def __init__(self, ciphertext, word_file):
|
|
|
|
|
|
|
|
|
|
# count chars in the ciphertext
|
|
|
|
|
most_freq_cipher = Counter(ciphertext).most_common(1)[0][0]
|
|
|
|
|
|
|
|
|
|
self.text = ciphertext
|
|
|
|
|
@ -39,12 +45,18 @@ class Breaker():
|
|
|
|
|
self.words = Breaker.read_word_file(word_file)
|
|
|
|
|
|
|
|
|
|
def choose_word(self):
|
|
|
|
|
"""
|
|
|
|
|
Iterate through the word list and pick a word for pattern matching.
|
|
|
|
|
Words whose chars are all known are removed.
|
|
|
|
|
Words where less than a third of the chars are known are skipped.
|
|
|
|
|
"""
|
|
|
|
|
known_chars = self.key.keys()
|
|
|
|
|
for i in range(len(self.words)):
|
|
|
|
|
word = self.words[i]
|
|
|
|
|
for word in self.words:
|
|
|
|
|
|
|
|
|
|
word = word + "" # copy
|
|
|
|
|
|
|
|
|
|
if len(word) == 0:
|
|
|
|
|
self.words.pop(i)
|
|
|
|
|
self.words.remove(word)
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
# count known chars in word
|
|
|
|
|
@ -55,15 +67,22 @@ class Breaker():
|
|
|
|
|
|
|
|
|
|
# remove known words
|
|
|
|
|
if n == len(word):
|
|
|
|
|
self.words.pop(i)
|
|
|
|
|
self.words.remove(word)
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
# skip words with too many unknown chars
|
|
|
|
|
if (n / len(word) < 0.3): continue
|
|
|
|
|
|
|
|
|
|
return self.words.pop(i)
|
|
|
|
|
self.words.remove(word)
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
def translate_and_regex(self, word: str):
|
|
|
|
|
"""
|
|
|
|
|
Prepare chosen word for pattern matching.
|
|
|
|
|
Translate the known characters and replace the others with a regex '.'
|
|
|
|
|
"""
|
|
|
|
|
regex = ""
|
|
|
|
|
for char in word:
|
|
|
|
|
if char in self.key.keys():
|
|
|
|
|
@ -73,6 +92,11 @@ class Breaker():
|
|
|
|
|
return regex
|
|
|
|
|
|
|
|
|
|
def match_ciphertext(self, regex):
|
|
|
|
|
"""
|
|
|
|
|
Compile the translated chosen word to a regular expression and find all
|
|
|
|
|
matches inside the ciphertext. Count the occurrences and pick the most
|
|
|
|
|
frequent one.
|
|
|
|
|
"""
|
|
|
|
|
regexc = re.compile(regex)
|
|
|
|
|
count = Counter(regexc.findall(self.text))
|
|
|
|
|
if len(count) == 0:
|
|
|
|
|
@ -81,6 +105,12 @@ class Breaker():
|
|
|
|
|
return count.most_common(1)[0][0]
|
|
|
|
|
|
|
|
|
|
def extract_unknown(self, plain, regex, cipher):
|
|
|
|
|
"""
|
|
|
|
|
Compare the chosen word in its various forms to infer which
|
|
|
|
|
new characters may be added to the alphabet map.
|
|
|
|
|
The dots inside `regex' symbolize the positions of unknown characters
|
|
|
|
|
and provide a mapping between `plain' and `cipher' text.
|
|
|
|
|
"""
|
|
|
|
|
assert len(plain) == len(regex) == len(cipher)
|
|
|
|
|
assert "." in regex
|
|
|
|
|
|
|
|
|
|
@ -91,21 +121,50 @@ class Breaker():
|
|
|
|
|
return self.key
|
|
|
|
|
|
|
|
|
|
def key_to_str(self):
    """
    Return the keys of `self.key` concatenated into one string, in the
    mapping's iteration order. An empty mapping yields "".
    """
    # Join the individual characters; str() on the keys view would produce
    # its repr ("dict_keys([...])") instead of the characters themselves.
    return "".join(self.key.keys())
|
|
|
|
|
|
|
|
|
|
def get_key(self):
    """
    Tie the individual steps together to derive the key.

    Repeatedly picks a word with `choose_word`, turns it into a pattern
    with `translate_and_regex`, matches that pattern against the
    ciphertext with `match_ciphertext` and hands the result to
    `extract_unknown`. The loop ends once `self.key` holds 26 entries or
    `choose_word` returns "" (no usable word left).

    NOTE: the original author marked this routine as not working yet.

    Returns the value of `key_to_str()`.
    """
    # 26 entries: one per letter (cf. the 26 letters in EN_LETTER_FREQ).
    while len(self.key) < 26:
        word = self.choose_word()
        if word == "":
            # no more words with unknown chars
            break

        regex = self.translate_and_regex(word)
        cipher = self.match_ciphertext(regex)
        if cipher == "":
            # pattern did not occur in the ciphertext; try the next word
            continue

        # Update the key exactly once per matched word. Calling
        # extract_unknown a second time (e.g. from a debug print) would
        # repeat its side effects on self.key.
        self.extract_unknown(word, regex, cipher)

    return self.key_to_str()
|
|
|
|
|
|
|
|
|
|
## end Breaker
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    import argparse
    import os
    import sys

    # cannot import from a parent package if called directly
    # without modifying PYTHONPATH or sys.path
    file_dir = os.path.dirname(os.path.abspath(__file__))
    file_parent_dir = os.path.dirname(file_dir)
    sys.path.append(file_parent_dir)

    from libex01 import read_text

    def parse_args(sys_argv):
        """
        Parse the command line.

        `sys_argv` is the full argument vector including the program name
        at index 0, which is skipped. The returned namespace carries
        `FILE` (positional) and `words` (optional, default "common.txt").
        """
        parser = argparse.ArgumentParser()
        parser.add_argument("FILE",
                            help="file with the ciphertext to analyse")
        # Previously hard-coded; the default keeps the old behaviour.
        parser.add_argument("-w", "--words", default="common.txt",
                            help="word list file (default: %(default)s)")
        return parser.parse_args(sys_argv[1:])

    args = parse_args(sys.argv)
    txt = read_text(args.FILE)
    word_file = args.words

    bm = Breaker(txt, word_file)

    print(bm.get_key())
|
|
|
|
|
|