Merge branch 'break_mono'
commit
ecb3157a91
@ -0,0 +1,170 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
"""
|
||||||
|
Python module to derive a key from a monoalphabetically encrypted file.
|
||||||
|
Does not work yet.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
class Breaker:
    """
    A handle on the data needed to derive the key from the ciphertext.

    Instance attributes (all set in ``__init__``):

    * ``text``  -- the ciphertext being analysed.
    * ``key``   -- dict mapping a plaintext character to the ciphertext
      character it is assumed to be encrypted as.
    * ``words`` -- list of candidate plaintext words that have not been
      consumed by ``choose_word`` yet.
    """

    # Index 0 is paired with the most frequent ciphertext character, so this
    # is presumably ordered from most to least frequent English letter.
    EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz")

    # A word is only used for pattern matching when at least this fraction of
    # its characters is already present in the key.
    MIN_KNOWN_RATIO = 0.3

    @staticmethod
    def read_word_file(word_file):
        """
        Read the word list at path `word_file` into memory.

        The rationale was that querying would be faster and that used words
        can be deleted without mutating the file.

        Every line is stripped of surrounding whitespace and lower-cased.
        Empty lines and entries containing non-alphabetic characters are
        dropped. Returns the remaining words as a list, in file order.
        """
        words = []
        with open(word_file, 'r') as wf:
            for line in wf:
                # strip() instead of line[:-1]: the last line of a file is
                # not guaranteed to end in a newline, and slicing would then
                # cut off the final letter of the last word.
                word = line.strip().lower()

                # "".isalpha() is False, so empty lines are rejected as well.
                if word.isalpha():
                    words.append(word)
        return words

    def __init__(self, ciphertext, word_file):
        """
        Seed the key with a single frequency-analysis guess and load the words.

        The most frequent character of `ciphertext` is assumed to stand for
        ``EN_LETTER_FREQ[0]``. For an empty ciphertext the key starts empty.
        `word_file` is the path handed to ``read_word_file``.

        NOTE(review): every character of the ciphertext is counted, including
        whitespace and punctuation if the caller did not strip them -- confirm
        that the input contains only enciphered letters.
        """
        self.text = ciphertext
        self.key = {}

        # most_common(1) is an empty list for empty input; guard so that an
        # empty ciphertext does not raise IndexError.
        most_freq = Counter(ciphertext).most_common(1)
        if most_freq:
            self.key[Breaker.EN_LETTER_FREQ[0]] = most_freq[0][0]

        self.words = Breaker.read_word_file(word_file)

    def choose_word(self):
        """
        Pick the next word from the word list for pattern matching.

        Walking the list in order:

        * empty words and words whose characters are all known are removed,
        * words where less than ``MIN_KNOWN_RATIO`` of the characters are
          known are skipped (they stay in the list for a later call),
        * the first remaining word is removed from the list and returned.

        Returns "" when no word qualifies.
        """
        known_chars = self.key
        kept = []
        chosen = ""

        # Build the surviving list instead of calling self.words.remove()
        # while iterating: removing during iteration shifts the following
        # element into the current slot, so it would never be examined.
        for index, word in enumerate(self.words):
            n_known = sum(1 for char in word if char in known_chars)

            # Nothing left to learn from this word (also covers "").
            if n_known == len(word):
                continue

            # Too many unknown characters for now; keep it for later.
            if n_known / len(word) < Breaker.MIN_KNOWN_RATIO:
                kept.append(word)
                continue

            chosen = word
            kept.extend(self.words[index + 1:])
            break

        # Slice assignment keeps the identity of the list object.
        self.words[:] = kept
        return chosen

    def translate_and_regex(self, word: str):
        """
        Prepare the chosen word for pattern matching.

        Every character present in the key is replaced by its ciphertext
        character; every other character becomes the regex wildcard '.'.
        The result has exactly one pattern character per character of `word`;
        key values are inserted verbatim (not regex-escaped), which is what
        keeps the positions aligned for ``extract_unknown``.
        """
        return "".join(self.key.get(char, ".") for char in word)

    def match_ciphertext(self, regex):
        """
        Find the most frequent match of `regex` inside the ciphertext.

        `regex` is compiled and all non-overlapping matches in ``self.text``
        are counted. Returns the most frequent matched string, or "" when
        there is no match at all.
        """
        occurrences = Counter(re.compile(regex).findall(self.text))
        if not occurrences:
            return ""
        return occurrences.most_common(1)[0][0]

    def extract_unknown(self, plain, regex, cipher):
        """
        Add newly inferred characters to the key.

        `plain` is the chosen word, `regex` its translated pattern and
        `cipher` the ciphertext fragment that matched. The dots inside
        `regex` mark the positions of unknown characters; at each of them
        the character of `plain` is mapped to the character of `cipher`.

        All three arguments must have the same length and `regex` must
        contain at least one '.', otherwise an AssertionError is raised.
        Returns the updated key dict.
        """
        assert len(plain) == len(regex) == len(cipher)
        assert "." in regex

        for plain_char, pattern_char, cipher_char in zip(plain, regex, cipher):
            if pattern_char == ".":
                self.key[plain_char] = cipher_char

        return self.key

    def key_to_str(self):
        """
        Return the plaintext characters of the key, joined in insertion order.

        NOTE(review): this joins the dict keys (plaintext letters), not the
        ciphertext letters they map to -- confirm this is the key format the
        caller expects.
        """
        return "".join(self.key)

    def get_key(self):
        """
        String everything together and return the key as a string.

        Repeats choose word -> build pattern -> match ciphertext -> extend key
        until 26 characters are mapped or no usable word is left. According
        to the original author this does not yet produce a working key.
        """
        while len(self.key) < 26:
            word = self.choose_word()

            # no more words with unknown chars
            if word == "":
                break

            regex = self.translate_and_regex(word)
            cipher = self.match_ciphertext(regex)

            # The pattern does not occur in the ciphertext; try another word.
            if cipher == "":
                continue

            self.extract_unknown(word, regex, cipher)

        return self.key_to_str()
|
||||||
|
## end Breaker
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import argparse
    import os
    import sys

    # When this file is executed directly, the parent package cannot be
    # imported unless its directory is on sys.path (or PYTHONPATH), so the
    # parent of this file's directory is appended before the import below.
    sys.path.append(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    )

    from libex01 import read_text

    def parse_args(sys_argv):
        """Parse `sys_argv` (program name first); one positional: FILE."""
        cli = argparse.ArgumentParser()
        cli.add_argument("FILE")
        return cli.parse_args(sys_argv[1:])

    # Load the ciphertext named on the command line, then print the key
    # derived from it with the word list "common.txt".
    ciphertext = read_text(parse_args(sys.argv).FILE)
    print(Breaker(ciphertext, "common.txt").get_key())
|
||||||
Loading…
Reference in New Issue