Merge branch 'break_mono'

2020-11-25 03:56:44 +01:00
parent cf105c397f ed3d03d599
commit ecb3157a91
1 changed files with 170 additions and 0 deletions
--- a/src/mono/break_mono.py
+++ b/src/mono/break_mono.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python
+
+"""
+Python module to derive a key from a monoalphabetically encrypted file.
+Does not work yet.
+"""
+
+import re
+from collections import Counter
+
+class Breaker:
+    """
+    Derive the key of a monoalphabetic substitution from its ciphertext.
+
+    The key is grown step by step: the most frequent ciphertext letter is
+    taken to stand for 'e' (the first entry of EN_LETTER_FREQ); afterwards
+    dictionary words that can already be translated in part are matched
+    against the ciphertext to learn further letters.
+
+    Attributes:
+        text:  the ciphertext being analysed.
+        key:   dict mapping a plaintext letter to its ciphertext letter.
+        words: dictionary words that have not been consumed yet.
+    """
+
+    EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz")
+
+    # Minimum fraction of a word's letters that must already be in the key
+    # before the word is used for pattern matching.
+    MIN_KNOWN_RATIO = 0.3
+
+    @staticmethod
+    def read_word_file(word_file):
+        """
+        Read the word list into memory, one word per line.
+
+        Words are lower-cased; empty lines and entries containing anything
+        but letters are skipped. Holding the list in memory makes it possible
+        to drop used words without mutating the file.
+
+        Args:
+            word_file: path of the word list.
+
+        Returns:
+            The accepted words as a list, in file order.
+        """
+        words = []
+        with open(word_file, 'r') as wf:
+            for line in wf:
+                # Strip only the line terminator. Slicing off the last
+                # character would truncate a final line that has no newline.
+                word = line.rstrip("\n").lower()
+
+                # "".isalpha() is False, so empty lines are skipped as well.
+                if word.isalpha():
+                    words.append(word)
+        return words
+
+    def __init__(self, ciphertext, word_file):
+        """
+        Set up the breaker and seed the key with a guess for 'e'.
+
+        Args:
+            ciphertext: the encrypted text to analyse.
+            word_file:  path of the word list, see read_word_file().
+        """
+        self.text = ciphertext
+        self.key = {}
+        self.words = Breaker.read_word_file(word_file)
+
+        # Count letters only: blanks and punctuation are usually the most
+        # frequent characters of a text and must not be mistaken for 'e'.
+        letter_counts = Counter(char for char in ciphertext if char.isalpha())
+
+        # A ciphertext without letters leaves the key empty instead of
+        # raising an IndexError.
+        if letter_counts:
+            most_freq_cipher = letter_counts.most_common(1)[0][0]
+            self.key[Breaker.EN_LETTER_FREQ[0]] = most_freq_cipher
+
+    def choose_word(self):
+        """
+        Pick the next word for pattern matching and remove it from the list.
+
+        The word list is scanned in order:
+        - words whose letters are all known are dropped, they teach nothing;
+        - words with fewer than MIN_KNOWN_RATIO known letters stay in the
+          list for a later call;
+        - the first other word is removed from the list and returned.
+
+        Returns:
+            The chosen word, or "" when no word qualifies.
+        """
+        remaining = []
+        chosen = ""
+
+        # Build the surviving list separately: removing items from the list
+        # that is being iterated makes the iterator skip the following word.
+        for index, word in enumerate(self.words):
+            if not word:
+                continue
+
+            known = sum(1 for char in word if char in self.key)
+
+            # drop completely known words
+            if known == len(word):
+                continue
+
+            # keep words with too many unknown chars for later
+            if known / len(word) < Breaker.MIN_KNOWN_RATIO:
+                remaining.append(word)
+                continue
+
+            chosen = word
+            # words behind the chosen one have not been examined yet
+            remaining.extend(self.words[index + 1:])
+            break
+
+        # slice assignment keeps the identity of the list object
+        self.words[:] = remaining
+        return chosen
+
+    def translate_and_regex(self, word: str):
+        """
+        Prepare the chosen word for pattern matching.
+
+        Known characters are translated with the key, every other character
+        is replaced by the regex wildcard '.'.
+
+        Args:
+            word: plaintext word from the word list.
+
+        Returns:
+            The pattern as a string, one pattern element per character of
+            `word`.
+        """
+        return "".join(self.key.get(char, ".") for char in word)
+
+    def match_ciphertext(self, regex):
+        """
+        Find all (non-overlapping) matches of `regex` inside the ciphertext,
+        count the occurrences and pick the most frequent one.
+
+        Args:
+            regex: pattern as produced by translate_and_regex().
+
+        Returns:
+            The most frequent match, or "" when nothing matches.
+        """
+        matches = Counter(re.findall(regex, self.text))
+        if not matches:
+            return ""
+        return matches.most_common(1)[0][0]
+
+    def extract_unknown(self, plain, regex, cipher):
+        """
+        Compare the chosen word in its various forms to infer which new
+        characters may be added to the key.
+
+        The dots inside `regex` mark the positions of unknown characters and
+        provide the mapping between `plain` and `cipher`. The candidate is
+        applied only as a whole and only if it keeps the key one-to-one; it
+        is discarded when
+        - a dot position holds a non-letter,
+        - the same plaintext letter would get two different cipher letters,
+        - two plaintext letters would share one cipher letter, or
+        - a cipher letter is already assigned to another plaintext letter.
+
+        Args:
+            plain:  the plaintext word.
+            regex:  its pattern, same length as `plain`.
+            cipher: the ciphertext match, same length as `plain`.
+
+        Returns:
+            The (possibly extended) key dict.
+        """
+        assert len(plain) == len(regex) == len(cipher)
+        assert "." in regex
+
+        proposed = {}
+        for plain_char, marker, cipher_char in zip(plain, regex, cipher):
+            if marker != ".":
+                continue
+            # '.' also matches blanks and punctuation, these are no letters
+            if not cipher_char.isalpha():
+                return self.key
+            # a repeated plaintext letter must always see the same cipher char
+            if proposed.setdefault(plain_char, cipher_char) != cipher_char:
+                return self.key
+
+        # cipher letters held by plaintext letters the candidate doesn't touch
+        taken = {c for p, c in self.key.items() if p not in proposed}
+        new_chars = list(proposed.values())
+        if len(set(new_chars)) != len(new_chars):
+            return self.key
+        if taken.intersection(new_chars):
+            return self.key
+
+        self.key.update(proposed)
+        return self.key
+
+    def key_to_str(self):
+        """
+        Return the plaintext letters of the key, joined in insertion order.
+        """
+        # NOTE(review): only the plaintext side of the mapping is returned,
+        # the cipher letters are not part of the string - confirm that this
+        # is the intended key format.
+        return "".join(self.key)
+
+    def get_key(self):
+        """
+        String everything together.
+
+        Words are chosen, matched and evaluated until every letter of
+        EN_LETTER_FREQ is in the key or no usable word is left. Guesses are
+        never revisited, so the result is a heuristic.
+
+        Returns:
+            The key as produced by key_to_str().
+        """
+        while len(self.key) < len(Breaker.EN_LETTER_FREQ):
+
+            word = self.choose_word()
+            if not word:
+                break  # no more words with unknown chars
+
+            regex = self.translate_and_regex(word)
+            cipher = self.match_ciphertext(regex)
+
+            if cipher:
+                self.extract_unknown(word, regex, cipher)
+
+        return self.key_to_str()
+    ## end Breaker
+
+if __name__ == "__main__":
+    import argparse
+    import os
+    import sys
+
+    # cannot import from a parent package if called directly
+    # without modifying PYTHONPATH or sys.path, so the parent directory
+    # of this file's directory is appended to sys.path before the import
+    file_dir = os.path.dirname(os.path.abspath(__file__))
+    file_parent_dir = os.path.dirname(file_dir)
+    sys.path.append(file_parent_dir)
+
+    from libex01 import read_text
+
+    def parse_args(sys_argv):
+        """
+        Parse the command line.
+
+        `sys_argv` is the full argument vector including the program name.
+        FILE is the ciphertext file; the optional -w/--words names the word
+        list and defaults to "common.txt" in the current directory.
+        """
+        parser = argparse.ArgumentParser()
+        parser.add_argument("FILE")
+        parser.add_argument("-w", "--words", default="common.txt",
+                            help="word list, one word per line "
+                                 "(default: %(default)s)")
+        return parser.parse_args(sys_argv[1:])
+
+
+    args = parse_args(sys.argv)
+    txt = read_text(args.FILE)
+    word_file = args.words
+
+    bm = Breaker(txt, word_file)
+
+    print(bm.get_key())