diff --git a/src/mono/break_mono.py b/src/mono/break_mono.py old mode 100644 new mode 100755 index e8b7aaa..4bb44a1 --- a/src/mono/break_mono.py +++ b/src/mono/break_mono.py @@ -2,22 +2,27 @@ """ Python module to derive a key from an monoalphabetically encrypted file. +Does not work yet. """ import re - from collections import Counter -#from string import ascii_lowercase - -import pdb class Breaker(): + """ + A handle on the various bits of data needed to derive the key from the ciphertext. + """ EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz") @staticmethod def read_word_file(word_file): - # excuse me + """ + Helper function to read the words file into memory. The rationale was that + querying would be faster and it would be possible to delete used words + without mutating the file. + """ + words = [] with open(word_file, 'r') as wf: for line in wf: @@ -32,6 +37,7 @@ class Breaker(): def __init__(self, ciphertext, word_file): + # count chars in the ciphertext most_freq_cipher = Counter(ciphertext).most_common(1)[0][0] self.text = ciphertext @@ -39,12 +45,18 @@ class Breaker(): self.words = Breaker.read_word_file(word_file) def choose_word(self): + """ + Iterate through the word list and pick a word for pattern matching. + Words with chars that are completely known are being removed. + Words where fewer than a third of the chars are known are being skipped. 
+ """ known_chars = self.key.keys() - for i in range(len(self.words)): - word = self.words[i] + for word in self.words: + + word = word + "" # copy if len(word) == 0: - self.words.pop(i) + self.words.remove(word) continue # count known chars in word @@ -55,15 +67,22 @@ class Breaker(): # remove known words if n == len(word): - self.words.pop(i) + self.words.remove(word) continue # skip words with too many unknown chars if (n / len(word) < 0.3): continue - return self.words.pop(i) + self.words.remove(word) + return word + + return "" def translate_and_regex(self, word: str): + """ + Prepare chosen word for pattern matching. + Translate the known characters and replace the others with a regex '.' + """ regex = "" for char in word: if char in self.key.keys(): @@ -73,6 +92,11 @@ class Breaker(): return regex def match_ciphertext(self, regex): + """ + Compile the translated chosen word to a regular expression and find all + matches inside the ciphertext. Count the occurrences and pick the most + frequent one. + """ regexc = re.compile(regex) count = Counter(regexc.findall(self.text)) if len(count) == 0: @@ -81,6 +105,12 @@ class Breaker(): return count.most_common(1)[0][0] def extract_unknown(self, plain, regex, cipher): + """ + Compare the chosen word in its various forms to infer which + new characters may be added to the alphabet map. + The dots inside `regex' symbolize the positions of unknown characters + and provide a mapping between `plain' and `cipher' text. + """ assert len(plain) == len(regex) == len(cipher) assert "." in regex @@ -91,21 +121,50 @@ class Breaker(): return self.key def key_to_str(self): - return str(self.key.keys()) + return "".join(self.key.keys()) def get_key(self): - + """ + Strings everything together. + Unfortunately it does not work. 
+ """ while len(self.key.keys()) < 26: word = self.choose_word() + + if word == "": break # no more words with unknown chars + regex = self.translate_and_regex(word) cipher = self.match_ciphertext(regex) if cipher == "": continue - - print(word, regex, cipher, - self.extract_unknown(word, regex, cipher)) + self.extract_unknown(word, regex, cipher) return self.key_to_str() - ## end Breaker + +if __name__ == "__main__": + import sys, os + import argparse + + # cannot import from a parent package if called directly + # without modifying PYTHONPATH or sys.path + file_dir = os.path.dirname(os.path.abspath(__file__)) + file_parent_dir = os.path.dirname(file_dir) + sys.path.append(file_parent_dir) + + from libex01 import read_text + + def parse_args(sys_argv): + parser = argparse.ArgumentParser() + parser.add_argument("FILE") + return parser.parse_args(sys_argv[1:]) + + + args = parse_args(sys.argv) + txt = read_text(args.FILE) + word_file = "common.txt" + + bm = Breaker(txt, word_file) + + print(bm.get_key())