From 5355e25ba37c4d229b8e96d0819c1517b69c3f3c Mon Sep 17 00:00:00 2001 From: Daniel Tschertkow Date: Tue, 24 Nov 2020 13:12:25 +0100 Subject: [PATCH 1/7] =?UTF-8?q?F=C3=BCgt=20aktuellen=20Zustand=20von=20bre?= =?UTF-8?q?ak=5Fmono.py=20hinzu?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mono/break_mono.py | 117 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 src/mono/break_mono.py diff --git a/src/mono/break_mono.py b/src/mono/break_mono.py new file mode 100644 index 0000000..ddc7995 --- /dev/null +++ b/src/mono/break_mono.py @@ -0,0 +1,117 @@ +""" +Python module to derive a key from an monoalphabetically encrypted file. +""" +import itertools as it +import re + +from collections import OrderedDict +from collections import Counter +from string import ascii_lowercase + +#from libex01 import read_text + +def mono_break(enc_txt: str): + + EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz") + + def get_frequency(text): + freq = Counter(text) + # Counter with lowercase ascii letters all having a count of 0 + missing = Counter(dict(it.product(ascii_lowercase, [0]))) + freq.update(missing) + return freq + + def derive_alphabet(freq: Counter): + return OrderedDict(zip(list(freq.keys()), EN_LETTER_FREQ)) + + subs = derive_alphabet(get_frequency(enc_txt)) + dec = "" + for char in enc_txt: + dec += subs[char] + return dec + +## Frequenzanalyse +EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz") + +def get_frequency(text): + freq = Counter(text) + # Counter with lowercase ascii letters all having a count of 0 + missing = Counter(dict(it.product(ascii_lowercase, [0]))) + freq.update(missing) + return freq + +def derive_alphabet_freq(freq: Counter): + most_freq = [ item[0] for item in freq.most_common() ] + #return OrderedDict(zip(EN_LETTER_FREQ, most_freq)) + return OrderedDict(zip(EN_LETTER_FREQ, most_freq)) + +## Mustersuche +def next_char_anchor(text: str, char): + """ + Generator that takes the text and a char and yields positions of that char. + Adjust starting position by slicing. + Return generator closure. + """ + index = text.find(char) + while index != -1: + yield (index, char) + index = text.find(char, index + 1) + return (index, char) + + +def match_word(text: str, word: str, pos_iter): + """ + Align with anchor and check hypothesis. + First hypothesis is the frequency analysis. + Align `word' with `text' for each anchor. + For remaining anchors check the occurence of pattern + and match with word. If it is true more than once, save it as a + new hypothesis. + """ + pass + +def next_match(text: str, word: str, alphabet: dict, pos_iter: generator): + pass + +class Breaker(): + + EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz") + + def __init__(self, ciphertext, word_file): + self.alph = derive_alphabet_freq(get_frequency(ciphertext)) + self.word_file = None # TODO + + def get_key(self): + + def get_word_containing(char_list: list): + """ + Find word from a word list file (common.txt) containing the chars + in `char_list'. + Return None it no word matches or + TUPLE(word, pos) where `pos' is a LIST of matching positions. + """ + with open(self.word_file, 'r') as f: + for word in f: + pos = [] + for char in char_list: + pos += word.find(char) + if -1 not in pos: + return word, pos + return None + + most_freq = next(iter(self.alph)) # most frequent char + word_having_char, pos = get_word_containing(most_freq) # unpack + + + + pass + + + +text = "gryticdettpjcjgtmtntajgryticdkrqstmkojgjgtmtrmjgtatnrgdpmatmjgcdnrpmhgoqmatpetopzsoqqtmfcajgtoaeatrmrpmjgtfozdatnmcpjqotjgtinltrscfgdwrpmontrntedjktmckgrjktkrpjrpmktjgopskgrjktlqtrntgryticdqoytmjgttultaotphtgryticdkojptnntmjgtlqrzdtltclqtwrsopzereotnncwtjowtnxdnjjctnhrltopjgonqrpmcfhcwltjojocpjgthcwlrnnocponzcptitjktozpcatjgtpttmirpmktsttlldngopzcpktsttlldngopzcpjgononxdnjrldpsachsncpzkaojjtpfcajgtltclqtkgchrpnttncwtjgopznkacpzqostrpjnoprhcqcpiktmccdangratedjjgtatnncwrpicjgtafdhsopopnthjncdjjgtatrpmjgononxdnjrldpsachsncpzqostkcastanoprfrhjcaiktmccdangratedjjgtatnncwrpicjgtafdhsopacecjncdjjgtatgryticdyonojtmjgtbdrzwoatgryticdnkrwopjgtngojjgtlrajihcpytpjocpnrpmjgtatrqlcqojosjgtfrhtnrqkrinmofftatpjjgtagtjcaohjgtnrwtedjktnkrqqckojrpmktnttpcjgopzhgrpztpcjgopzgrnhgrpztmjtpwoqqocpmcqqrancprqcnopzhrwlrozpjktpjiwoqqocpnjrayopzrpmkaojgopzoplropeoznjacpzltclqtdpkoqqopzjczoytnwrqqopyonocprpmltanlthjoytcptopfoytsomnetqckjgtlcytajiqoptcptlcldqrjocpadppopcdjcfjowt" + + +freq = get_frequency(text) +alph = derive_alphabet(freq) + +print(alph.values()) From cde1cadba1c35afc5804c3fbeb7d6d97b5f12169 Mon Sep 17 00:00:00 2001 From: Daniel Tschertkow Date: Tue, 24 Nov 2020 13:28:00 +0100 Subject: [PATCH 2/7] =?UTF-8?q?R=C3=A4umt=20break=5Fmono.py=20auf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mono/break_mono.py | 104 ++++++++++++++----------------------------------- 1 file changed, 29 insertions(+), 75 deletions(-) diff --git a/src/mono/break_mono.py b/src/mono/break_mono.py index ddc7995..58f42c0 100644 --- a/src/mono/break_mono.py +++ b/src/mono/break_mono.py @@ -8,12 +8,13 @@ from collections import OrderedDict from collections import Counter from string import ascii_lowercase -#from libex01 import read_text -def mono_break(enc_txt: str): +class Breaker(): + ## frequency analysis EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz") + @staticmethod def get_frequency(text): freq = Counter(text) # Counter with lowercase ascii letters all having a count of 0 @@ -21,88 +22,41 @@ def mono_break(enc_txt: str): freq.update(missing) return freq - def derive_alphabet(freq: Counter): - return OrderedDict(zip(list(freq.keys()), EN_LETTER_FREQ)) - - subs = derive_alphabet(get_frequency(enc_txt)) - dec = "" - for char in enc_txt: - dec += subs[char] - return dec - -## Frequenzanalyse -EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz") - -def get_frequency(text): - freq = Counter(text) - # Counter with lowercase ascii letters all having a count of 0 - missing = Counter(dict(it.product(ascii_lowercase, [0]))) - freq.update(missing) - return freq - -def derive_alphabet_freq(freq: Counter): - most_freq = [ item[0] for item in freq.most_common() ] - #return OrderedDict(zip(EN_LETTER_FREQ, most_freq)) - return OrderedDict(zip(EN_LETTER_FREQ, most_freq)) - -## Mustersuche -def next_char_anchor(text: str, char): - """ - Generator that takes the text and a char and yields positions of that char. - Adjust starting position by slicing. - Return generator closure. - """ - index = text.find(char) - while index != -1: - yield (index, char) - index = text.find(char, index + 1) - return (index, char) - - -def match_word(text: str, word: str, pos_iter): - """ - Align with anchor and check hypothesis. - First hypothesis is the frequency analysis. - Align `word' with `text' for each anchor. - For remaining anchors check the occurence of pattern - and match with word. If it is true more than once, save it as a - new hypothesis. - """ - pass - -def next_match(text: str, word: str, alphabet: dict, pos_iter: generator): - pass -class Breaker(): + @staticmethod + def derive_alphabet_freq(freq: Counter): + most_freq = [ item[0] for item in freq.most_common() ] + #return OrderedDict(zip(EN_LETTER_FREQ, most_freq)) + return OrderedDict(zip(EN_LETTER_FREQ, most_freq)) + + + ## pattern matching + @staticmethod + def get_word_containing(word_file, char_list: list): + """ + Find word from a word list file (common.txt) containing the chars + in `char_list'. + Return None it no word matches or + TUPLE(word, pos) where `pos' is a LIST of matching positions. + """ + with open(word_file, 'r') as f: + for word in f: + pos = [] + for char in char_list: + pos += word.find(char) + if -1 not in pos: + return word, pos + return None - EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz") def __init__(self, ciphertext, word_file): self.alph = derive_alphabet_freq(get_frequency(ciphertext)) self.word_file = None # TODO - def get_key(self): - - def get_word_containing(char_list: list): - """ - Find word from a word list file (common.txt) containing the chars - in `char_list'. - Return None it no word matches or - TUPLE(word, pos) where `pos' is a LIST of matching positions. - """ - with open(self.word_file, 'r') as f: - for word in f: - pos = [] - for char in char_list: - pos += word.find(char) - if -1 not in pos: - return word, pos - return None + def get_key(self): most_freq = next(iter(self.alph)) # most frequent char - word_having_char, pos = get_word_containing(most_freq) # unpack - - + word_having_char, pos = get_word_containing(self.word_file, most_freq) pass From e3f854fa0c6581bb0402fe1fc0a1f0c98476e276 Mon Sep 17 00:00:00 2001 From: Daniel Tschertkow Date: Tue, 24 Nov 2020 19:18:14 +0100 Subject: [PATCH 3/7] =?UTF-8?q?f=C3=BCgt=20match=5Fciphertext=20in=20break?= =?UTF-8?q?=5Fmono=20hinzu?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mono/break_mono.py | 63 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 14 deletions(-) diff --git a/src/mono/break_mono.py b/src/mono/break_mono.py index 58f42c0..a362a8f 100644 --- a/src/mono/break_mono.py +++ b/src/mono/break_mono.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + """ Python module to derive a key from an monoalphabetically encrypted file. """ @@ -27,7 +29,7 @@ class Breaker(): def derive_alphabet_freq(freq: Counter): most_freq = [ item[0] for item in freq.most_common() ] #return OrderedDict(zip(EN_LETTER_FREQ, most_freq)) - return OrderedDict(zip(EN_LETTER_FREQ, most_freq)) + return OrderedDict(zip(Breaker.EN_LETTER_FREQ, most_freq)) ## pattern matching @@ -40,32 +42,65 @@ class Breaker(): TUPLE(word, pos) where `pos' is a LIST of matching positions. """ with open(word_file, 'r') as f: - for word in f: + for line in f: + word = line[:-1] pos = [] for char in char_list: - pos += word.find(char) + pos.append(word.find(char)) if -1 not in pos: return word, pos - return None + return None, None + + @staticmethod + def positions(text: str, sub): + index = text.find(sub) + while index != -1: + yield index + index = text.find(sub, index + 1) + return index + @staticmethod + def match_ciphertext(text: str, word_pos: tuple, char: tuple): + word, wposl = word_pos + wpos = wposl[0] + wlen = len(word) + + snip_count = Counter() + for pos in Breaker.positions(text, char): + word_begin = pos - wpos + snippet = text[word_begin : word_begin + wlen] + + if snippet not in snip_count.elements(): + snip_count[snippet] = text.count(snippet) + + return snip_count.most_common(1)[0][0] + def __init__(self, ciphertext, word_file): - self.alph = derive_alphabet_freq(get_frequency(ciphertext)) - self.word_file = None # TODO + self.ciphertext = ciphertext + self.alph = self.derive_alphabet_freq(self.get_frequency(ciphertext)) + self.word_file = word_file def get_key(self): - most_freq = next(iter(self.alph)) # most frequent char - word_having_char, pos = get_word_containing(self.word_file, most_freq) - - pass -text = "gryticdettpjcjgtmtntajgryticdkrqstmkojgjgtmtrmjgtatnrgdpmatmjgcdnrpmhgoqmatpetopzsoqqtmfcajgtoaeatrmrpmjgtfozdatnmcpjqotjgtinltrscfgdwrpmontrntedjktmckgrjktkrpjrpmktjgopskgrjktlqtrntgryticdqoytmjgttultaotphtgryticdkojptnntmjgtlqrzdtltclqtwrsopzereotnncwtjowtnxdnjjctnhrltopjgonqrpmcfhcwltjojocpjgthcwlrnnocponzcptitjktozpcatjgtpttmirpmktsttlldngopzcpktsttlldngopzcpjgononxdnjrldpsachsncpzkaojjtpfcajgtltclqtkgchrpnttncwtjgopznkacpzqostrpjnoprhcqcpiktmccdangratedjjgtatnncwrpicjgtafdhsopopnthjncdjjgtatrpmjgononxdnjrldpsachsncpzqostkcastanoprfrhjcaiktmccdangratedjjgtatnncwrpicjgtafdhsopacecjncdjjgtatgryticdyonojtmjgtbdrzwoatgryticdnkrwopjgtngojjgtlrajihcpytpjocpnrpmjgtatrqlcqojosjgtfrhtnrqkrinmofftatpjjgtagtjcaohjgtnrwtedjktnkrqqckojrpmktnttpcjgopzhgrpztpcjgopzgrnhgrpztmjtpwoqqocpmcqqrancprqcnopzhrwlrozpjktpjiwoqqocpnjrayopzrpmkaojgopzoplropeoznjacpzltclqtdpkoqqopzjczoytnwrqqopyonocprpmltanlthjoytcptopfoytsomnetqckjgtlcytajiqoptcptlcldqrjocpadppopcdjcfjowt" + # most frequent char in English and corresponding most common char in text + #most_freq = self.alph.popitem(last=False) + most_freq = next(iter(self.alph)) + word_pos = Breaker.get_word_containing( + self.word_file, + #most_freq[0] + most_freq + ) + most_common = Breaker.match_ciphertext( + self.ciphertext, + word_pos, + most_freq + ) + print("most_freq", most_freq, "word_pos:", word_pos, "most_common:", most_common) -freq = get_frequency(text) -alph = derive_alphabet(freq) -print(alph.values()) + ## end Breaker From 3331177041c77c990b0d345d84e130d0519dcba1 Mon Sep 17 00:00:00 2001 From: Daniel Tschertkow Date: Tue, 24 Nov 2020 21:41:23 +0100 Subject: [PATCH 4/7] =?UTF-8?q?f=C3=BCgt=20choose=5Fknown=5Fletters=20hinz?= =?UTF-8?q?u.=20Vor=20rewrite=20match=5Fciphertext?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mono/break_mono.py | 51 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/src/mono/break_mono.py b/src/mono/break_mono.py index a362a8f..a1f94c2 100644 --- a/src/mono/break_mono.py +++ b/src/mono/break_mono.py @@ -62,6 +62,9 @@ class Breaker(): @staticmethod def match_ciphertext(text: str, word_pos: tuple, char: tuple): + """ + asdf + """ word, wposl = word_pos wpos = wposl[0] wlen = len(word) @@ -76,6 +79,19 @@ class Breaker(): return snip_count.most_common(1)[0][0] + @staticmethod + def choose_known_letters(key_alphabet): + letters = list(key_alphabet.keys()) + if len(key_alphabet) < 3: + yield letters + else: + for i in letters: + for j in letters: + for k in letters: + if k == j or k == i or j == i: continue + yield [i, j, k] + return None + def __init__(self, ciphertext, word_file): self.ciphertext = ciphertext self.alph = self.derive_alphabet_freq(self.get_frequency(ciphertext)) @@ -84,23 +100,28 @@ class Breaker(): def get_key(self): - + key_alphabet = OrderedDict() # most frequent char in English and corresponding most common char in text - #most_freq = self.alph.popitem(last=False) - most_freq = next(iter(self.alph)) - word_pos = Breaker.get_word_containing( - self.word_file, - #most_freq[0] - most_freq - ) - most_common = Breaker.match_ciphertext( - self.ciphertext, - word_pos, - most_freq - ) - - print("most_freq", most_freq, "word_pos:", word_pos, "most_common:", most_common) + most_freq = self.alph[Breaker.EN_LETTER_FREQ] + + key_alphabet[Breaker.EN_LETTER_FREQ[0]] = most_freq + + while len(key_alphabet) < 26: + + word_pos = Breaker.get_word_containing( + self.word_file, + next(Breaker.choose_known_letters(key_alphabet)) + ) + + pass + + most_common = Breaker.match_ciphertext( + self.ciphertext, + word_pos, + most_freq + ) + pass ## end Breaker From 12cca20112701e9f7e101c319e52b15fe562b150 Mon Sep 17 00:00:00 2001 From: Daniel Tschertkow Date: Tue, 24 Nov 2020 23:52:14 +0100 Subject: [PATCH 5/7] break_mono.py wird komplett umgeschrieben --- src/mono/break_mono.py | 161 +++++++++++++++++++------------------------------ 1 file changed, 61 insertions(+), 100 deletions(-) diff --git a/src/mono/break_mono.py b/src/mono/break_mono.py index a1f94c2..5cb2840 100644 --- a/src/mono/break_mono.py +++ b/src/mono/break_mono.py @@ -3,125 +3,86 @@ """ Python module to derive a key from an monoalphabetically encrypted file. """ -import itertools as it + import re -from collections import OrderedDict from collections import Counter -from string import ascii_lowercase + class Breaker(): - ## frequency analysis - EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz") - @staticmethod - def get_frequency(text): - freq = Counter(text) - # Counter with lowercase ascii letters all having a count of 0 - missing = Counter(dict(it.product(ascii_lowercase, [0]))) - freq.update(missing) - return freq - - - @staticmethod - def derive_alphabet_freq(freq: Counter): - most_freq = [ item[0] for item in freq.most_common() ] - #return OrderedDict(zip(EN_LETTER_FREQ, most_freq)) - return OrderedDict(zip(Breaker.EN_LETTER_FREQ, most_freq)) - - - ## pattern matching - @staticmethod - def get_word_containing(word_file, char_list: list): - """ - Find word from a word list file (common.txt) containing the chars - in `char_list'. - Return None it no word matches or - TUPLE(word, pos) where `pos' is a LIST of matching positions. - """ - with open(word_file, 'r') as f: - for line in f: - word = line[:-1] - pos = [] - for char in char_list: - pos.append(word.find(char)) - if -1 not in pos: - return word, pos - return None, None - - @staticmethod - def positions(text: str, sub): - index = text.find(sub) - while index != -1: - yield index - index = text.find(sub, index + 1) - return index - - - @staticmethod - def match_ciphertext(text: str, word_pos: tuple, char: tuple): - """ - asdf - """ - word, wposl = word_pos - wpos = wposl[0] - wlen = len(word) - - snip_count = Counter() - for pos in Breaker.positions(text, char): - word_begin = pos - wpos - snippet = text[word_begin : word_begin + wlen] - - if snippet not in snip_count.elements(): - snip_count[snippet] = text.count(snippet) - - return snip_count.most_common(1)[0][0] - - @staticmethod - def choose_known_letters(key_alphabet): - letters = list(key_alphabet.keys()) - if len(key_alphabet) < 3: - yield letters - else: - for i in letters: - for j in letters: - for k in letters: - if k == j or k == i or j == i: continue - yield [i, j, k] - return None + EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz") def __init__(self, ciphertext, word_file): - self.ciphertext = ciphertext - self.alph = self.derive_alphabet_freq(self.get_frequency(ciphertext)) - self.word_file = word_file + most_freq_cipher = Counter(ciphertext).most_common(1)[0][0] - def get_key(self): + # excuse me: + words = [] + with open(word_file, 'r') as wf: + for line in wf: + words.append(line[:-1]) # remove trailing newline and append + + self.text = ciphertext + self.key = { Breaker.EN_LETTER_FREQ[0] : most_freq_cipher } + self.words = words + + def choose_word(self): + known_chars = self.key.keys() + for i in range(len(self.words)): + word = self.words[i] - key_alphabet = OrderedDict() + if len(word) == 0: continue - # most frequent char in English and corresponding most common char in text - most_freq = self.alph[Breaker.EN_LETTER_FREQ] + # count known chars in word + n = 0 + for char in word: + if char in known_chars: + n +=1 - key_alphabet[Breaker.EN_LETTER_FREQ[0]] = most_freq + # skip known words or words with too many unknown + if n == len(word) or n / len(word) < 0.3: + continue - while len(key_alphabet) < 26: + return self.words[i] - word_pos = Breaker.get_word_containing( - self.word_file, - next(Breaker.choose_known_letters(key_alphabet)) - ) + def translate_and_regex(self, word: str): + regex = "" + for char in word: + if char in self.key.keys(): + regex = regex + self.key[char] + else: + regex = regex + "." + return regex + + def match_ciphertext(self, regex): + rx = re.compile(regex) + count = Counter(re.findall(self.text)) + return count.most_common(1)[0][0] + + def extract_unknown(self, plain, regex, cipher): + assert len(plain) == len(regex) == len(cipher) + + for i in range(len(regex)): + if regex[i] != ".": continue + self.key.update({ plain[i] : cipher[i] }) + + return self.key + + def key_to_str(self): + return str(self.key.keys()) + + def get_key(self): - pass + while len(self.key.keys()) < 26: - most_common = Breaker.match_ciphertext( - self.ciphertext, - word_pos, - most_freq - ) + word = self.choose_word() + regex = self.translate_and_regex(word) + cipher = self.match_ciphertext(regex) + print(word, regex, cipher, self.extract_unknown()) - pass + return self.key_to_str() ## end Breaker From afb6faabbc486833a4ebbd091d7abb2f29712df2 Mon Sep 17 00:00:00 2001 From: Daniel Tschertkow Date: Wed, 25 Nov 2020 02:15:30 +0100 Subject: [PATCH 6/7] =?UTF-8?q?f=C3=BCgt=20diverse=20Bugfixes=20hinzu?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mono/break_mono.py | 57 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/src/mono/break_mono.py b/src/mono/break_mono.py index 5cb2840..e8b7aaa 100644 --- a/src/mono/break_mono.py +++ b/src/mono/break_mono.py @@ -7,34 +7,45 @@ Python module to derive a key from an monoalphabetically encrypted file. import re from collections import Counter +#from string import ascii_lowercase - +import pdb class Breaker(): - EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz") - def __init__(self, ciphertext, word_file): - - most_freq_cipher = Counter(ciphertext).most_common(1)[0][0] - - # excuse me: + @staticmethod + def read_word_file(word_file): + # excuse me words = [] with open(word_file, 'r') as wf: for line in wf: - words.append(line[:-1]) # remove trailing newline and append + word = line[:-1] # remove trailing newline + word = word.lower() + + if word == "" or not word.isalpha(): + continue + + words.append(word) + return words + + def __init__(self, ciphertext, word_file): + + most_freq_cipher = Counter(ciphertext).most_common(1)[0][0] self.text = ciphertext self.key = { Breaker.EN_LETTER_FREQ[0] : most_freq_cipher } - self.words = words + self.words = Breaker.read_word_file(word_file) def choose_word(self): known_chars = self.key.keys() for i in range(len(self.words)): word = self.words[i] - if len(word) == 0: continue + if len(word) == 0: + self.words.pop(i) + continue # count known chars in word n = 0 @@ -42,11 +53,15 @@ class Breaker(): if char in known_chars: n +=1 - # skip known words or words with too many unknown - if n == len(word) or n / len(word) < 0.3: + # remove known words + if n == len(word): + self.words.pop(i) continue - return self.words[i] + # skip words with too many unknown chars + if (n / len(word) < 0.3): continue + + return self.words.pop(i) def translate_and_regex(self, word: str): regex = "" @@ -58,12 +73,16 @@ class Breaker(): return regex def match_ciphertext(self, regex): - rx = re.compile(regex) - count = Counter(re.findall(self.text)) - return count.most_common(1)[0][0] + regexc = re.compile(regex) + count = Counter(regexc.findall(self.text)) + if len(count) == 0: + return "" + else: + return count.most_common(1)[0][0] def extract_unknown(self, plain, regex, cipher): assert len(plain) == len(regex) == len(cipher) + assert "." in regex for i in range(len(regex)): if regex[i] != ".": continue @@ -81,7 +100,11 @@ class Breaker(): word = self.choose_word() regex = self.translate_and_regex(word) cipher = self.match_ciphertext(regex) - print(word, regex, cipher, self.extract_unknown()) + + if cipher == "": continue + + print(word, regex, cipher, + self.extract_unknown(word, regex, cipher)) return self.key_to_str() From ed3d03d599962bf7e09b0cd3e29a28c8eac68ce8 Mon Sep 17 00:00:00 2001 From: Daniel Tschertkow Date: Wed, 25 Nov 2020 03:55:50 +0100 Subject: [PATCH 7/7] =?UTF-8?q?F=C3=BCgt=20Dokumentation=20hinzu?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mono/break_mono.py | 91 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 75 insertions(+), 16 deletions(-) mode change 100644 => 100755 src/mono/break_mono.py diff --git a/src/mono/break_mono.py b/src/mono/break_mono.py old mode 100644 new mode 100755 index e8b7aaa..4bb44a1 --- a/src/mono/break_mono.py +++ b/src/mono/break_mono.py @@ -2,22 +2,27 @@ """ Python module to derive a key from an monoalphabetically encrypted file. +Does not work yet. """ import re - from collections import Counter -#from string import ascii_lowercase - -import pdb class Breaker(): + """ + A handle on the various bits of data needed to derive the key from the ciphertext. + """ EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz") @staticmethod def read_word_file(word_file): - # excuse me + """ + Helper function to read the words file into memory. The rationale was that + querying would be faster and it would be possible to delete used words + without mutating the file. + """ + words = [] with open(word_file, 'r') as wf: for line in wf: @@ -32,6 +37,7 @@ class Breaker(): def __init__(self, ciphertext, word_file): + # count chars in the ciphertext most_freq_cipher = Counter(ciphertext).most_common(1)[0][0] self.text = ciphertext @@ -39,12 +45,18 @@ class Breaker(): self.words = Breaker.read_word_file(word_file) def choose_word(self): + """ + Iterate through the word list and pick a word for pattern matching. + Words with chars that are completely known are are being removed. + Words where less then a third of the chars are known are being skipped. + """ known_chars = self.key.keys() - for i in range(len(self.words)): - word = self.words[i] + for word in self.words: + + word = word + "" # copy if len(word) == 0: - self.words.pop(i) + self.words.remove(word) continue # count known chars in word @@ -55,15 +67,22 @@ class Breaker(): # remove known words if n == len(word): - self.words.pop(i) + self.words.remove(word) continue # skip words with too many unknown chars if (n / len(word) < 0.3): continue - return self.words.pop(i) + self.words.remove(word) + return word + + return "" def translate_and_regex(self, word: str): + """ + Prepare chosen word for pattern matching. + Translate the known characters and replace the others with a regex '.' + """ regex = "" for char in word: if char in self.key.keys(): @@ -73,6 +92,11 @@ class Breaker(): return regex def match_ciphertext(self, regex): + """ + Compile the translated chosen word to a regular expression and find all + matches inside the ciphertext. Count the occurences and pick the most + frequent one. + """ regexc = re.compile(regex) count = Counter(regexc.findall(self.text)) if len(count) == 0: @@ -81,6 +105,12 @@ class Breaker(): return count.most_common(1)[0][0] def extract_unknown(self, plain, regex, cipher): + """ + Compare the the chosen words in it's various forms to infer which + new characters may be added to the alphabet map. + The dots inside `regex' symbolize the positions of unknown characters + and provide a mapping between `plain' and `cipher' text. + """ assert len(plain) == len(regex) == len(cipher) assert "." in regex @@ -91,21 +121,50 @@ class Breaker(): return self.key def key_to_str(self): - return str(self.key.keys()) + return "".join(self.key.keys()) def get_key(self): - + """ + Strings everything together. + Unfortunatly it does not work. + """ while len(self.key.keys()) < 26: word = self.choose_word() + + if word == "": break # no more words with unknown chars + regex = self.translate_and_regex(word) cipher = self.match_ciphertext(regex) if cipher == "": continue - - print(word, regex, cipher, - self.extract_unknown(word, regex, cipher)) + self.extract_unknown(word, regex, cipher) return self.key_to_str() - ## end Breaker + +if __name__ == "__main__": + import sys, os + import argparse + + # cannot import from a parent package if called directly + # without modifying PYTHONPATH or sys.path + file_dir = os.path.dirname(os.path.abspath(__file__)) + file_parent_dir = os.path.dirname(file_dir) + sys.path.append(file_parent_dir) + + from libex01 import read_text + + def parse_args(sys_argv): + parser = argparse.ArgumentParser() + parser.add_argument("FILE") + return parser.parse_args(sys_argv[1:]) + + + args = parse_args(sys.argv) + txt = read_text(args.FILE) + word_file = "common.txt" + + bm = Breaker(txt, word_file) + + print(bm.get_key())