Merge branch 'break_mono'
commit
ecb3157a91
@ -0,0 +1,170 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
"""
|
||||
Python module to derive a key from a monoalphabetically encrypted file.
|
||||
Does not work yet.
|
||||
"""
|
||||
|
||||
import re
|
||||
from collections import Counter
|
||||
|
||||
class Breaker:
    """
    A handle on the various bits of data needed to derive the key from the
    ciphertext of a monoalphabetic substitution.

    Attributes set by ``__init__``:
        text:  the ciphertext exactly as passed in.
        key:   dict mapping a plaintext letter to the cipher symbol that is
               currently believed to stand for it.
        words: candidate plaintext words, read from the word file; words are
               removed from this list as they are used up.
    """

    # English letters ordered from most to least frequent.
    EN_LETTER_FREQ = list("etaoinsrhdlucmfywgpbvkxqjz")

    # A word is only used for pattern matching when at least this share of
    # its letters already has an entry in the key.
    MIN_KNOWN_RATIO = 0.3

    @staticmethod
    def read_word_file(word_file):
        """
        Read the word list into memory, one word per line.

        Keeping the words in a list makes querying fast and allows used words
        to be dropped without touching the file.

        Surrounding whitespace is stripped (so a final line without a
        trailing newline keeps its last letter), words are lower-cased, and
        lines that are empty or contain non-alphabetic characters are skipped.

        :param word_file: path of the word list.
        :return: list of lower-case, purely alphabetic words in file order.
        """
        words = []
        with open(word_file, 'r') as wf:
            for line in wf:
                word = line.strip().lower()
                # "".isalpha() is False, so blank lines are skipped as well
                if word.isalpha():
                    words.append(word)
        return words

    @staticmethod
    def _most_frequent_symbol(text):
        """
        Return the most frequent letter of `text`.

        Falls back to the most frequent non-whitespace character when `text`
        contains no letters at all, and to "" when nothing is left to count.
        """
        counts = Counter(ch for ch in text if ch.isalpha())
        if not counts:
            counts = Counter(ch for ch in text if not ch.isspace())
        if not counts:
            return ""
        return counts.most_common(1)[0][0]

    def __init__(self, ciphertext, word_file):
        """
        Seed the key with a single frequency guess and load the word list.

        The most frequent symbol of the ciphertext is assumed to stand for
        the most frequent English letter. Whitespace is never used as that
        symbol; if the ciphertext offers no usable symbol the key starts
        empty.

        :param ciphertext: the encrypted text.
        :param word_file:  path of the word list, see `read_word_file`.
        """
        self.text = ciphertext
        self.key = {}

        most_freq_cipher = Breaker._most_frequent_symbol(ciphertext)
        if most_freq_cipher:
            self.key[Breaker.EN_LETTER_FREQ[0]] = most_freq_cipher

        self.words = Breaker.read_word_file(word_file)

    def choose_word(self):
        """
        Walk the word list in order and pick a word for pattern matching.

        Empty words and words whose letters are all known are dropped from
        the list. Words where fewer than MIN_KNOWN_RATIO of the letters are
        known are skipped but stay in the list for a later call. The first
        remaining word is removed from the list and returned.

        :return: the chosen word, or "" when no word qualifies.
        """
        chosen = ""
        remaining = []

        # Build the surviving list separately instead of removing entries
        # from the list that is being iterated, which would skip elements.
        for index, word in enumerate(self.words):
            if not word:
                continue

            known = sum(1 for char in word if char in self.key)

            # nothing left to learn from this word
            if known == len(word):
                continue

            # too many unknown letters for a meaningful pattern
            if known / len(word) < self.MIN_KNOWN_RATIO:
                remaining.append(word)
                continue

            chosen = word
            remaining.extend(self.words[index + 1:])
            break

        self.words[:] = remaining
        return chosen

    def translate_and_regex(self, word: str):
        """
        Prepare the chosen word for pattern matching.

        Known letters are replaced by their cipher symbol from the key, every
        other letter by a '.' placeholder. The result has the same length as
        `word`.
        """
        return "".join(self.key.get(char, ".") for char in word)

    def match_ciphertext(self, regex):
        """
        Find the most frequent piece of ciphertext matching the pattern.

        `regex` is the template produced by `translate_and_regex`: a '.'
        matches any single character (except a newline), every other
        character is matched literally. Literal characters are escaped before
        compiling so that cipher symbols such as '(' or '*' cannot break or
        alter the pattern.

        :return: the most frequent match, or "" when there is none.
        """
        pattern = re.compile(
            "".join("." if char == "." else re.escape(char) for char in regex)
        )
        hits = Counter(pattern.findall(self.text))
        if not hits:
            return ""
        return hits.most_common(1)[0][0]

    def extract_unknown(self, plain, regex, cipher):
        """
        Compare the chosen word in its various forms to infer which new
        letters may be added to the key.

        The dots inside `regex` mark the positions of unknown letters and so
        provide the mapping between `plain` and `cipher`.

        The match is only accepted as a whole. It is rejected, leaving the
        key untouched, when the same plaintext letter would get two different
        cipher symbols, or when a cipher symbol is already assigned to a
        different plaintext letter.

        :return: the (possibly extended) key dict.
        """
        assert len(plain) == len(regex) == len(cipher)
        assert "." in regex

        candidate = {}
        # reverse view of the key: cipher symbol -> plaintext letter
        taken = {symbol: letter for letter, symbol in self.key.items()}

        for letter, mark, symbol in zip(plain, regex, cipher):
            if mark != ".":
                continue
            if candidate.get(letter, symbol) != symbol:
                return self.key  # one letter, two symbols
            if taken.get(symbol, letter) != letter:
                return self.key  # symbol already stands for another letter
            candidate[letter] = symbol
            taken[symbol] = letter

        self.key.update(candidate)
        return self.key

    def key_to_str(self):
        """
        Join the plaintext letters of the key in the order they were found.

        NOTE(review): only the plaintext side of the mapping ends up in the
        string, the cipher symbols are not included. Confirm the key format
        the callers expect before relying on this output.
        """
        return "".join(self.key.keys())

    def get_key(self):
        """
        String everything together.

        Repeatedly chooses a word, matches its pattern against the ciphertext
        and extends the key, until all 26 letters are known or no usable word
        is left. Every round consumes one word, so the loop terminates.

        :return: the key rendered by `key_to_str`.
        """
        while len(self.key) < 26:

            word = self.choose_word()

            if word == "":
                break  # no more words with unknown chars

            regex = self.translate_and_regex(word)
            cipher = self.match_ciphertext(regex)

            if cipher == "":
                continue
            self.extract_unknown(word, regex, cipher)

        return self.key_to_str()

## end Breaker
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: read the ciphertext file, run the breaker
    # and print the resulting key string.
    import sys
    import os
    import argparse

    # cannot import from a parent package if called directly
    # without modifying PYTHONPATH or sys.path
    file_dir = os.path.dirname(os.path.abspath(__file__))
    file_parent_dir = os.path.dirname(file_dir)
    sys.path.append(file_parent_dir)

    # read_text lives in the project module libex01 (not visible here);
    # presumably it returns the file content as a string -- confirm there.
    from libex01 import read_text

    def parse_args(sys_argv):
        """
        Parse the command line.

        FILE is the ciphertext file. The word list defaults to "common.txt"
        (resolved against the current working directory) and can be
        overridden with -w/--words.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument("FILE")
        parser.add_argument(
            "-w", "--words",
            default="common.txt",
            help="word list with one word per line (default: %(default)s)",
        )
        return parser.parse_args(sys_argv[1:])

    args = parse_args(sys.argv)
    txt = read_text(args.FILE)
    word_file = args.words

    bm = Breaker(txt, word_file)

    print(bm.get_key())
|
||||
Loading…
Reference in New Issue