break_mono.py wird komplett umgeschrieben

2020-11-24 23:52:14 +01:00
parent 3331177041
commit 12cca20112
1 changed files with 65 additions and 104 deletions
--- a/src/mono/break_mono.py
+++ b/src/mono/break_mono.py
@@ -3,125 +3,86 @@
 """
 Python module to derive a key from an monoalphabetically encrypted file.
 """
-import itertools as it
+
 import re

-from collections import OrderedDict
 from collections import Counter
-from string import ascii_lowercase
+


 class Breaker():

-    ## frequency analysis
+
    EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz")

-    @staticmethod
-    def get_frequency(text):
-        freq = Counter(text)
-        # Counter with lowercase ascii letters all having a count of 0
-        missing = Counter(dict(it.product(ascii_lowercase, [0])))
-        freq.update(missing)
-        return freq
-
-
-    @staticmethod
-    def derive_alphabet_freq(freq: Counter):
-        most_freq = [ item[0] for item in freq.most_common() ]
-        #return OrderedDict(zip(EN_LETTER_FREQ, most_freq))
-        return OrderedDict(zip(Breaker.EN_LETTER_FREQ, most_freq))
-
-
-    ## pattern matching
-    @staticmethod
-    def get_word_containing(word_file, char_list: list):
-        """
-        Find word from a word list file (common.txt) containing the chars
-        in `char_list'.
-        Return None it no word matches or
-        TUPLE(word, pos) where `pos' is a LIST of matching positions.
-        """
-        with open(word_file, 'r') as f:
-            for line in f:
-                word = line[:-1]
-                pos = []
-                for char in char_list:
-                    pos.append(word.find(char))
-                if -1 not in pos:
-                    return word, pos
-        return None, None
-
-    @staticmethod
-    def positions(text: str, sub):
-        index = text.find(sub)
-        while index != -1:
-            yield index
-            index = text.find(sub, index + 1)
-        return index
-
-
-    @staticmethod
-    def match_ciphertext(text: str, word_pos: tuple, char: tuple):
-        """
-        asdf
-        """
-        word, wposl = word_pos
-        wpos = wposl[0]
-        wlen = len(word)
-
-        snip_count = Counter()
-        for pos in Breaker.positions(text, char):
-            word_begin = pos - wpos
-            snippet = text[word_begin : word_begin + wlen]
-
-            if snippet not in snip_count.elements():
-                snip_count[snippet] = text.count(snippet)
-
-        return snip_count.most_common(1)[0][0]
-
-    @staticmethod
-    def choose_known_letters(key_alphabet):
-        letters = list(key_alphabet.keys())
-        if len(key_alphabet) < 3:
-            yield letters
-        else:
-            for i in letters:
-                for j in letters:
-                    for k in letters:
-                        if k == j or k == i or j == i: continue
-                        yield [i, j, k]
-        return None
-
    def __init__(self, ciphertext, word_file):
-        self.ciphertext = ciphertext
-        self.alph = self.derive_alphabet_freq(self.get_frequency(ciphertext))
-        self.word_file = word_file

+        most_freq_cipher = Counter(ciphertext).most_common(1)[0][0]  # most frequent symbol; IndexError on empty ciphertext
+
+        # Load the word list. Strip only the newline, so a last line without one keeps its final letter.
+        words = []
+        with open(word_file, 'r') as wf:
+            for line in wf:
+                words.append(line.rstrip("\n"))
+
+        self.text = ciphertext
+        self.key = { Breaker.EN_LETTER_FREQ[0] : most_freq_cipher }  # plaintext letter -> ciphertext letter, seeded with the most frequent pair
+        self.words = words
+
+    def choose_word(self):  # pick the next word from self.words to match against the ciphertext
+        known_chars = self.key.keys()  # plaintext letters that already have a cipher letter in the key
+        for i in range(len(self.words)):
+            word = self.words[i]
+
+            if len(word) == 0: continue  # skip empty entries
+
+            # count how many characters of the word are already known
+            n = 0
+            for char in word:
+                if char in known_chars:
+                    n +=1
+
+            # skip fully known words, and words where fewer than 30% of the characters are known
+            if n == len(word) or n / len(word) < 0.3:
+                continue
+
+            return self.words[i]  # first qualifying word; if none qualifies the method falls through and returns None
+
+    def translate_and_regex(self, word: str):  # build a pattern for `word`: known letters become their cipher letter, unknown ones "."
+        regex = ""
+        for char in word:
+            if char in self.key.keys():
+                regex = regex + self.key[char]  # inserted verbatim, not re.escape()d
+            else:
+                regex = regex + "."  # wildcard for a letter that is not in the key yet
+        return regex  # one pattern character per letter of `word`, assuming single-character key values
+
+    def match_ciphertext(self, regex):  # return the most frequent substring of self.text that matches `regex`
+        rx = re.compile(regex)
+        count = Counter(rx.findall(self.text))  # search with the compiled pattern; module-level re.findall needs (pattern, string)
+        return count.most_common(1)[0][0]  # raises IndexError when nothing in the text matches
+
+    def extract_unknown(self, plain, regex, cipher):  # add key entries for the wildcard positions of a matched snippet
+        assert len(plain) == len(regex) == len(cipher)  # the three strings are read position by position
+
+        for i in range(len(regex)):
+            if regex[i] != ".": continue  # only "." positions are recorded
+            self.key.update({ plain[i] : cipher[i] })  # plaintext letter -> ciphertext letter
+
+        return self.key  # the same dict object, updated in place
+
+    def key_to_str(self):  # NOTE(review): returns str() of the dict's keys view ("dict_keys([...])"), not the cipher letters — confirm intended format
+        return str(self.key.keys())

    def get_key(self):

-        key_alphabet = OrderedDict()
+        while len(self.key.keys()) < 26:

-        # most frequent char in English and corresponding most common char in text
-        most_freq = self.alph[Breaker.EN_LETTER_FREQ]
+            word   = self.choose_word()  # a word that is partly, but not fully, covered by the current key
+            regex  = self.translate_and_regex(word)  # known letters as cipher letters, unknown ones as "."
+            cipher = self.match_ciphertext(regex)  # most frequent ciphertext snippet fitting that pattern
+            print(word, regex, cipher, self.extract_unknown(word, regex, cipher))  # extract_unknown() takes (plain, regex, cipher) and returns the updated key

-        key_alphabet[Breaker.EN_LETTER_FREQ[0]] = most_freq
-
-        while len(key_alphabet) < 26:
-
-            word_pos = Breaker.get_word_containing(
-                self.word_file,
-                next(Breaker.choose_known_letters(key_alphabet))
-            )
-
-            pass
-
-            most_common = Breaker.match_ciphertext(
-                self.ciphertext,
-                word_pos,
-                most_freq
-            )
-
-        pass
+        return self.key_to_str()

    ## end Breaker