From 5ea9a8dd752c7fab4e7373c37ee79e6eaac52ffb Mon Sep 17 00:00:00 2001 From: Leonard Kugis Date: Mon, 22 Jul 2024 03:58:58 +0200 Subject: Initial commit --- .gitignore | 270 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ __init__.py | 0 yara-compiler.py | 62 +++++++++++++ yara.py | 246 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 578 insertions(+) create mode 100644 .gitignore create mode 100644 __init__.py create mode 100644 yara-compiler.py create mode 100644 yara.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..97f2a16 --- /dev/null +++ b/.gitignore @@ -0,0 +1,270 @@ +yara +# Created by https://www.toptal.com/developers/gitignore/api/python,windows,linux,macos,visualstudiocode +# Edit at https://www.toptal.com/developers/gitignore?templates=python,windows,linux,macos,visualstudiocode + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ 
+*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. 
+# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +### Windows ### +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + +# End of https://www.toptal.com/developers/gitignore/api/python,windows,linux,macos,visualstudiocode \ No 
"""Command-line front end that compiles JSON rule files into ``.yac`` files."""

import argparse
import logging
import os
import re
import sys

from yara import YaraDatabase

logger = logging.getLogger(__name__)

# Accepts a relative or absolute path that ends in ".yac".
_YAC_PATH = re.compile(r"(^\/|^\.\/|^\.\.\/|^[^/])[^:*?\"<>|\r\n]*\.yac$")

# Maps the number of -v flags to a logging level; anything above 2 is DEBUG.
_LOG_LEVELS = (logging.WARNING, logging.INFO, logging.DEBUG)


def dir_path(string):
    """Validate a command-line path argument (argparse ``type=`` callable).

    Args:
        string: Raw argument value.

    Returns:
        ``string`` unchanged when it names an existing directory or looks
        like a path to a ``.yac`` file.

    Raises:
        argparse.ArgumentTypeError: For any other value, so that argparse
            prints a regular usage error instead of a traceback.
    """
    if os.path.isdir(string) or _YAC_PATH.match(string):
        return string
    raise argparse.ArgumentTypeError(
        "{!r} is neither an existing directory nor a .yac file path".format(string)
    )


def walk(args):
    """Compile every ``.json`` file found below ``args["input_directory"]``.

    Args:
        args: Mapping with at least the keys ``"input_directory"`` and
            ``"output"``. When ``"output"`` ends in ``.yac`` all inputs are
            merged into that single file; otherwise ``"output"`` is used as
            a directory and one ``<basename>.yac`` is written per input.
    """
    logger.info("Walking files ...")

    files = [
        os.path.abspath(os.path.join(dirpath, name))
        for dirpath, _dirnames, filenames in os.walk(args["input_directory"])
        for name in filenames
    ]
    logger.debug("Files: %s", files)
    logger.info("Number of files found: %s", len(files))

    sources = [path for path in files if path.endswith(".json")]
    output = args["output"]

    if output.endswith(".yac"):
        # Single output file: collect every rule file into one database.
        database = YaraDatabase()
        for path in sources:
            logger.info("Compiling file %s", path)
            database.add_file(path)
        database.write_file(output)
        return

    # Output directory: one database (and one .yac file) per input file.
    for path in sources:
        logger.info("Compiling file %s", path)
        database = YaraDatabase()
        database.add_file(path)
        stem = os.path.splitext(os.path.basename(path))[0]
        database.write_file(os.path.join(output, stem + ".yac"))


def main():
    """Parse the command line, configure logging and run :func:`walk`."""
    parser = argparse.ArgumentParser(description='Compile single or multiple yara files')
    parser.add_argument('-i', '--input-directory', nargs='?', default='.', type=dir_path, help='Input directory (default: %(default)s)')
    parser.add_argument('-o', '--output', nargs='?', default='.', type=dir_path, help='Output file or directory (default: %(default)s)')
    # NOTE(review): --input-file is parsed and forwarded, but walk() only
    # reads "input_directory" and "output", so it currently has no effect.
    parser.add_argument('-f', '--input-file', nargs='?', default='.', type=dir_path, help='Input file (default: %(default)s)')
    parser.add_argument('-v', '--verbose', action="count", default=0, help="Verbosity level")
    namespace = parser.parse_args()

    log_level = _LOG_LEVELS[min(namespace.verbose, len(_LOG_LEVELS) - 1)]
    logging.basicConfig(stream=sys.stdout, level=log_level)

    args = {
        "input_directory": namespace.input_directory,
        "output": namespace.output,
        "input_file": namespace.input_file,
        "verbosity": namespace.verbose,
    }
    logger.debug("args = %s", args)

    walk(args)


if __name__ == "__main__":
    main()
"""Compiler that turns JSON-described YARA rules into the binary YAC format."""

import json
import sys
import os
import struct
import re
import logging

logger = logging.getLogger(__name__)


def _u8(value):
    """Return ``value`` as a single byte for the ``c`` struct fields.

    The byte order is passed explicitly because ``int.to_bytes`` only has
    defaults for its arguments from Python 3.11 on.

    Raises:
        OverflowError: If ``value`` does not fit into one unsigned byte.
    """
    return int(value).to_bytes(1, "little")


class OperatorTree(object):
    """Inner node of a condition tree: a binary ``and`` / ``or`` operation."""

    def __init__(self):
        self.left = None      # left operand: OperatorTree, OperatorOf or str leaf
        self.right = None     # right operand, same kinds as ``left``
        self.parent = None    # enclosing OperatorTree, None for the root
        self.operator = None  # numeric operator code written to the output
        self.data = None      # condition text this node was built from


class OperatorOf(object):
    """Leaf of a condition tree for ``<n> of <string set>`` expressions."""

    def __init__(self, parent, n, pattern):
        self.parent = parent    # enclosing OperatorTree, None for the root
        self.n = n              # "all", "any" or a decimal count, as text
        self.pattern = pattern  # string set text, e.g. "them" or "($a*, $b)"


class YaraDatabase(object):
    """Collects rule entries from JSON files and serializes them as YAC."""

    __FORMAT_HEADER = "=3sccI"
    __FORMAT_ENTRY = "=c{size_id}sc"
    __FORMAT_STRING = "=c{size_id}scH{size_text}scII"
    __FORMAT_WILDCARD = "=Ic"
    __FORMAT_RANGE = "=II"
    __FORMAT_OPERATOR = "=c"
    __FORMAT_OPERATOR_OF = "=cc"
    __FORMAT_OPERATOR_OF_ELEMENT = "=c"
    __FORMAT_OPERATOR_SINGLE = "=c"

    __STRING_TYPE_STRING = 0
    __STRING_TYPE_HEX = 1
    __STRING_TYPE_REGEX = 2

    __PATTERN_RANGE_VARIABLE = re.compile(r"^\[(\d+)-(\d+)\]$")
    __PATTERN_RANGE_FIXED = re.compile(r"^\[(\d+)\]$")
    __PATTERN_WILDCARD_HIGH = re.compile(r"^\?[0-9A-Fa-f]$")
    __PATTERN_WILDCARD_LOW = re.compile(r"^[0-9A-Fa-f]\?$")
    __PATTERN_WILDCARD_BOTH = re.compile(r"^\?\?$")
    # Group 1 is the count, group 5 the string set. A parenthesized set may
    # contain blanks ("($a, $b)"), so it is matched up to its closing paren.
    __PATTERN_OF = re.compile(r"((\d+)|(all)|(any))\s+of\s+(\([^)]*\)|[\w$*]+)")
    __PATTERN_AND = re.compile(r"(.*)\s+and\s+(.*)")
    __PATTERN_OR = re.compile(r"(.*)\s+or\s+(.*)")

    __CONDITION_OPERATOR_OR = 0
    __CONDITION_OPERATOR_AND = 1
    __CONDITION_OPERATOR_OF = 2
    __CONDITION_OPERATOR_SINGLE = 3
    __CONDITION_OPERATOR_TRUE = 4
    __CONDITION_OPERATOR_FALSE = 5

    # Modifier names from most significant (bit 6) to least significant (bit 0).
    __MODIFIER_BITS = ("nocase", "ascii", "wide", "fullword", "private", "i", "s")

    def __init__(self):
        self.__entries = list()

    @staticmethod
    def parse_file(file):
        """Return the list stored under ``"rules"`` in an open JSON file.

        Args:
            file: Readable file object containing a JSON document.

        Raises:
            KeyError: If the document has no ``"rules"`` key.
        """
        container = json.load(file)
        return list(container["rules"])

    @staticmethod
    def build_tree(condition, parent):
        """Parse a condition string into a tree of operator nodes.

        ``or`` is split before ``and`` so that ``and`` binds tighter; the
        greedy patterns split at the last occurrence of the keyword.

        Args:
            condition: Condition text of a rule.
            parent: Node the result is attached to, ``None`` for the root.

        Returns:
            An ``OperatorTree`` for ``and`` / ``or``, an ``OperatorOf`` for
            ``<n> of <set>``, or the whitespace-normalized text itself for
            anything else (for example a single string identifier).
        """
        # Collapse runs of blanks/newlines so that leaves carry no stray
        # whitespace and can be compared against string identifiers.
        condition = " ".join(condition.split())
        logger.debug("Parsing condition = %s", condition)

        binary_operators = (
            (YaraDatabase.__PATTERN_OR, YaraDatabase.__CONDITION_OPERATOR_OR),
            (YaraDatabase.__PATTERN_AND, YaraDatabase.__CONDITION_OPERATOR_AND),
        )
        for pattern, operator in binary_operators:
            match = pattern.search(condition)
            if match:
                node = OperatorTree()
                node.data = condition
                node.parent = parent
                node.operator = operator
                node.left = YaraDatabase.build_tree(match.group(1), node)
                node.right = YaraDatabase.build_tree(match.group(2), node)
                return node

        match = YaraDatabase.__PATTERN_OF.search(condition)
        if match:
            logger.debug("Leaf: OperatorOf, n = %s, pattern = %s", match.group(1), match.group(5))
            return OperatorOf(parent, match.group(1), match.group(5))

        logger.debug("Leaf: remainder = %s", condition)
        return condition

    @staticmethod
    def _of_regex(spec):
        """Translate the string set of an ``of`` expression into a regex.

        ``them`` selects everything, ``*`` becomes ``.*`` and ``,`` separates
        alternatives; every other character is matched literally.
        """
        spec = spec.strip()
        if spec == "them":
            return re.compile(r".*")
        parts = []
        depth = 0
        for char in spec:
            if char.isspace():
                continue
            if char == '*':
                parts.append(r".*")
            elif char == ',':
                parts.append(")|(")
            elif char == '(':
                parts.append("(")
                depth += 1
            elif char == ')':
                if depth == 0:
                    logger.warning("Unmatched paranthesis in pattern {}".format(spec))
                else:
                    parts.append(")")
                    depth -= 1
            else:
                parts.append(re.escape(char))
        # Close groups left open so a malformed set cannot break re.compile.
        parts.append(")" * depth)
        pattern = "".join(parts)
        logger.debug("Patched pattern = %s", pattern)
        return re.compile(pattern)

    @staticmethod
    def compile_tree(node, strings):
        """Serialize a condition tree in post-order (left, right, operator).

        Args:
            node: Result of :meth:`build_tree`.
            strings: String identifiers of the rule in declaration order;
                leaves are encoded as indexes into this sequence.

        Returns:
            ``bytearray`` with the encoded condition.

        Raises:
            OverflowError: If an index or count does not fit into one byte.
        """
        pack_operator = YaraDatabase.__FORMAT_OPERATOR

        if isinstance(node, OperatorTree):
            data = bytearray(YaraDatabase.compile_tree(node.left, strings))
            data += YaraDatabase.compile_tree(node.right, strings)
            data += struct.pack(pack_operator, _u8(node.operator))
            return data

        if isinstance(node, OperatorOf):
            logger.debug("Compiling OperatorOf, n = %s, pattern = %s", node.n, node.pattern)
            regex = YaraDatabase._of_regex(node.pattern)
            # fullmatch: "($a)" must select "$a" only, not "$ab" as well.
            selected = [index for index, name in enumerate(strings) if regex.fullmatch(name)]
            count = {"all": 0, "any": 1}.get(node.n, node.n)
            data = bytearray(struct.pack(pack_operator, _u8(YaraDatabase.__CONDITION_OPERATOR_OF)))
            data += struct.pack(YaraDatabase.__FORMAT_OPERATOR_OF, _u8(count), _u8(len(selected)))
            for index in selected:
                data += struct.pack(YaraDatabase.__FORMAT_OPERATOR_OF_ELEMENT, _u8(index))
            return data

        logger.debug("Compiling single identifier %s", node)
        name = node.strip() if isinstance(node, str) else node
        if name == "true":
            return bytearray(struct.pack(pack_operator, _u8(YaraDatabase.__CONDITION_OPERATOR_TRUE)))
        if name == "false":
            return bytearray(struct.pack(pack_operator, _u8(YaraDatabase.__CONDITION_OPERATOR_FALSE)))
        for index, candidate in enumerate(strings):
            if candidate == name:
                data = bytearray(struct.pack(pack_operator, _u8(YaraDatabase.__CONDITION_OPERATOR_SINGLE)))
                data += struct.pack(YaraDatabase.__FORMAT_OPERATOR_SINGLE, _u8(index))
                return data
        logger.warning("Single identifier {} not found, defaulting to true".format(node))
        return bytearray(struct.pack(pack_operator, _u8(YaraDatabase.__CONDITION_OPERATOR_TRUE)))

    @classmethod
    def _compile_hex_text(cls, text):
        """Parse a hex string into ``(data, wildcards, ranges)``.

        ``wildcards`` holds ``(position, nibble)`` pairs where nibble ``1``
        marks a ``?X`` block and ``0`` an ``X?`` block (``??`` adds both);
        ``ranges`` holds ``(position, length)`` pairs for ``[n]`` / ``[a-b]``.
        Wildcard nibbles are stored as ``0`` in ``data``.
        """
        data = bytearray()
        wildcards = []
        ranges = []
        position = 0
        # split() without argument skips empty blocks, which would otherwise
        # be counted as positions when blocks are separated by several blanks.
        for block in text.split():
            logger.debug("Compiling block = %s", block)
            match = cls.__PATTERN_RANGE_VARIABLE.match(block)
            if match:
                # NOTE(review): range() excludes the upper bound, so "[a-b]"
                # emits a .. b-1 -- confirm against the reader of this format.
                for length in range(int(match.group(1)), int(match.group(2))):
                    ranges.append((position, length))
                    position += 1
                continue
            match = cls.__PATTERN_RANGE_FIXED.match(block)
            if match:
                ranges.append((position, int(match.group(1))))
                position += 1
                continue
            if cls.__PATTERN_WILDCARD_HIGH.match(block):
                wildcards.append((position, 1))
                block = block.replace('?', '0')
            elif cls.__PATTERN_WILDCARD_LOW.match(block):
                wildcards.append((position, 0))
                block = block.replace('?', '0')
            elif cls.__PATTERN_WILDCARD_BOTH.match(block):
                wildcards.append((position, 0))
                wildcards.append((position, 1))
                block = block.replace('?', '0')
            data += bytearray.fromhex(block)
            position += 1
        return bytes(data), wildcards, ranges

    @classmethod
    def _compile_string(cls, string):
        """Serialize one string definition including wildcards and ranges."""
        logger.debug("Compiling string %s", string["id"])
        wildcards = []
        ranges = []
        if string["type"] == cls.__STRING_TYPE_HEX:
            text, wildcards, ranges = cls._compile_hex_text(string["text"])
        elif string["type"] in (cls.__STRING_TYPE_STRING, cls.__STRING_TYPE_REGEX):
            text = string["text"].encode("utf-8")
        else:
            # Unknown types are written with an empty text.
            text = b""

        modifiers = 0
        for name in cls.__MODIFIER_BITS:
            modifiers = (modifiers << 1) | (1 if string["modifiers"][name] else 0)

        # Sizes are taken from the encoded bytes: a non-ASCII identifier is
        # longer in UTF-8 than in characters and would be truncated otherwise.
        string_id = string["id"].encode("utf-8")
        data = bytearray(struct.pack(
            cls.__FORMAT_STRING.format(size_id=len(string_id), size_text=len(text)),
            _u8(len(string_id)), string_id, _u8(string["type"]),
            len(text), text, _u8(modifiers), len(wildcards), len(ranges)))
        for position, nibble in wildcards:
            data += struct.pack(cls.__FORMAT_WILDCARD, position, _u8(nibble))
        for position, length in ranges:
            data += struct.pack(cls.__FORMAT_RANGE, position, length)
        return data

    @classmethod
    def _compile_entry(cls, entry):
        """Serialize one rule: entry header, its strings, then its condition."""
        logger.debug("Compiling entry %s", entry["identifier"])
        identifier = entry["identifier"].encode("utf-8")
        strings = entry["strings"]
        data = bytearray(struct.pack(
            cls.__FORMAT_ENTRY.format(size_id=len(identifier)),
            _u8(len(identifier)), identifier, _u8(len(strings))))
        for string in strings:
            data += cls._compile_string(string)
        logger.debug("Building conditional operator tree for entry = %s, condition = %s", entry["identifier"], entry["condition"])
        node = cls.build_tree(entry["condition"], None)
        data += cls.compile_tree(node, [string["id"] for string in strings])
        logger.debug("Compilation done for entry %s", entry["identifier"])
        return data

    def to_bytes(self):
        """Return the complete YAC image (header plus all entries) as bytes."""
        header = struct.pack(self.__FORMAT_HEADER, "YAC".encode("utf-8"), b'\x00', b'\x00', len(self.__entries))
        logger.debug("Header data = %s", header)
        chunks = [header]
        chunks.extend(self._compile_entry(entry) for entry in self.__entries)
        return b"".join(chunks)

    def add_file(self, filename):
        """Append all rules of the JSON file ``filename`` to the database."""
        with open(filename, 'r', encoding="utf-8") as f:
            self.__entries.extend(YaraDatabase.parse_file(f))

    def write_file(self, filename):
        """Compile all collected entries and write them to ``filename``.

        The image is built completely before the file is opened, so a
        failing rule does not leave a truncated output file behind.
        """
        data = self.to_bytes()
        with open(filename, 'wb') as f:
            f.write(data)
        logger.debug("Compilation done for file %s", filename)