#!/usr/bin/env python

"""
Python module to derive a key from a monoalphabetically encrypted file.
Does not work yet.
"""

import re
from collections import Counter


class Breaker:
    """
    A handle on the various bits of data needed to derive the key from the
    ciphertext.

    Attributes set by ``__init__``:
        text:  the ciphertext exactly as passed in.
        key:   dict mapping a plaintext letter to the ciphertext character
               assumed to stand for it.
        words: list of candidate plaintext words still available to
               ``choose_word``.
    """

    # English letters ordered from most to least frequent.
    EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz")

    @staticmethod
    def read_word_file(word_file):
        """
        Read the words file into memory.

        The rationale was that querying would be faster and it would be
        possible to delete used words without mutating the file.

        Each line is stripped of surrounding whitespace and lower-cased.
        Empty lines and lines containing non-alphabetic characters are
        skipped.

        :param word_file: path of a text file holding one word per line.
        :return: list of lower-case, purely alphabetic words in file order.
        """
        words = []
        with open(word_file, 'r') as wf:
            for line in wf:
                # strip() instead of line[:-1]: the last line may lack a
                # trailing newline (slicing would then cut off a letter),
                # and CRLF files would otherwise leave a '\r' behind.
                word = line.strip().lower()
                if word.isalpha():  # "".isalpha() is False, so blanks drop too
                    words.append(word)
        return words

    def __init__(self, ciphertext, word_file):
        """
        Seed the key with a single frequency-based guess and load the words.

        The most frequent character of ``ciphertext`` is assumed to stand
        for the most frequent English letter (``EN_LETTER_FREQ[0]``).

        :param ciphertext: the encrypted text to analyse.
        :param word_file: path handed to ``read_word_file``.
        """
        # count chars in the ciphertext
        # NOTE(review): every character is counted, so if the ciphertext
        # still contains whitespace the first guess may be a blank --
        # confirm what the caller passes in.
        counts = Counter(ciphertext)

        self.text = ciphertext
        self.key = {}
        if counts:
            # An empty ciphertext has no most-common element; leave the key
            # empty instead of failing with an IndexError.
            most_freq_cipher = counts.most_common(1)[0][0]
            self.key[Breaker.EN_LETTER_FREQ[0]] = most_freq_cipher
        self.words = Breaker.read_word_file(word_file)

    def choose_word(self):
        """
        Iterate through the word list and pick a word for pattern matching.

        Empty words and words whose chars are all known are removed from
        the list. Words where less than 30% of the chars are known are
        skipped and stay in the list. The first remaining word is removed
        from the list and returned.

        :return: the chosen word, or "" if no word qualifies.
        """
        known_chars = self.key
        kept = []      # words that stay available for later calls
        chosen = ""

        # Walk the list through one explicit iterator so that the
        # unvisited tail can be kept after a word has been chosen. The
        # list itself is not modified while it is being iterated:
        # removing during iteration makes the loop skip the element that
        # follows each removed one.
        pending = iter(self.words)
        for word in pending:
            if not word:
                continue  # drop empty entries

            # count known chars in word
            n = sum(1 for char in word if char in known_chars)

            # remove known words
            if n == len(word):
                continue

            # skip words with too many unknown chars
            if n / len(word) < 0.3:
                kept.append(word)
                continue

            chosen = word
            break

        kept.extend(pending)
        # Slice assignment updates the existing list object in place.
        self.words[:] = kept
        return chosen

    def translate_and_regex(self, word: str) -> str:
        """
        Prepare the chosen word for pattern matching.

        Translate the known characters and replace the others with a
        regex '.'. The result has exactly one character per character of
        ``word``; ``extract_unknown`` depends on that.

        :param word: plaintext candidate word.
        :return: pattern string of the same length as ``word``.
        """
        return "".join(self.key.get(char, ".") for char in word)

    def match_ciphertext(self, regex):
        """
        Compile the translated chosen word to a regular expression and find
        all matches inside the ciphertext. Count the occurrences and pick
        the most frequent one.

        Every character of ``regex`` other than '.' is matched literally,
        so cipher characters that happen to be regex metacharacters
        ('*', '?', '(' ...) neither break compilation nor change the match
        length. A ciphertext '.' cannot be told apart from the wildcard.

        :param regex: pattern as produced by ``translate_and_regex``.
        :return: the most frequent matching substring, or "" if none.
        """
        pattern = "".join(
            "." if char == "." else re.escape(char) for char in regex
        )
        count = Counter(re.compile(pattern).findall(self.text))
        if not count:
            return ""
        return count.most_common(1)[0][0]

    def extract_unknown(self, plain, regex, cipher):
        """
        Compare the chosen word in its various forms to infer which new
        characters may be added to the alphabet map.

        The dots inside `regex' symbolize the positions of unknown
        characters and provide a mapping between `plain' and `cipher' text.

        :param plain: the plaintext candidate word.
        :param regex: its translation from ``translate_and_regex``.
        :param cipher: the ciphertext substring matched for ``regex``.
        :return: the updated key dict (the same object as ``self.key``).
        """
        assert len(plain) == len(regex) == len(cipher)
        assert "." in regex

        # NOTE(review): nothing here checks that a cipher character is
        # used for only one plaintext letter -- confirm whether a
        # one-to-one mapping should be enforced.
        for plain_char, marker, cipher_char in zip(plain, regex, cipher):
            if marker == ".":
                self.key[plain_char] = cipher_char

        return self.key

    def key_to_str(self):
        """
        Join the keys of the alphabet map into one string.

        NOTE(review): this yields the plaintext letters in the order they
        were added, not the cipher characters -- confirm the intended
        output format.
        """
        return "".join(self.key)

    def get_key(self):
        """
        Strings everything together.
        Unfortunately it does not work.

        Repeats choose / translate / match / extract until 26 letters are
        mapped or ``choose_word`` has no usable word left.

        :return: the result of ``key_to_str``.
        """
        while len(self.key) < 26:

            word = self.choose_word()

            if word == "":
                break  # no more words with unknown chars

            regex = self.translate_and_regex(word)
            cipher = self.match_ciphertext(regex)

            if cipher == "":
                continue  # pattern does not occur; try the next word
            self.extract_unknown(word, regex, cipher)

        return self.key_to_str()
    ## end Breaker


if __name__ == "__main__":
    import sys, os
    import argparse

    # cannot import from a parent package if called directly
    # without modifying PYTHONPATH or sys.path
    file_dir = os.path.dirname(os.path.abspath(__file__))
    file_parent_dir = os.path.dirname(file_dir)
    sys.path.append(file_parent_dir)

    from libex01 import read_text

    def parse_args(sys_argv):
        """Parse the command line: one positional FILE argument."""
        parser = argparse.ArgumentParser()
        parser.add_argument("FILE")
        return parser.parse_args(sys_argv[1:])

    args = parse_args(sys.argv)
    txt = read_text(args.FILE)
    # looked up relative to the current working directory
    word_file = "common.txt"

    bm = Breaker(txt, word_file)

    print(bm.get_key())