From 5ea9a8dd752c7fab4e7373c37ee79e6eaac52ffb Mon Sep 17 00:00:00 2001
From: Leonard Kugis <leonard@kug.is>
Date: Mon, 22 Jul 2024 03:58:58 +0200
Subject: Initial commit

---
 .gitignore       | 270 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 __init__.py      |   0
 yara-compiler.py |  62 +++++++++++++
 yara.py          | 246 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 578 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 __init__.py
 create mode 100644 yara-compiler.py
 create mode 100644 yara.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..97f2a16
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,270 @@
+yara
+# Created by https://www.toptal.com/developers/gitignore/api/python,windows,linux,macos,visualstudiocode
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,windows,linux,macos,visualstudiocode
+
+### Linux ###
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+### Windows ###
+# Windows thumbnail cache files
+Thumbs.db
+Thumbs.db:encryptable
+ehthumbs.db
+ehthumbs_vista.db
+
+# Dump file
+*.stackdump
+
+# Folder config file
+[Dd]esktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
+
+# End of https://www.toptal.com/developers/gitignore/api/python,windows,linux,macos,visualstudiocode
\ No newline at end of file
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/yara-compiler.py b/yara-compiler.py
new file mode 100644
index 0000000..4e0c77d
--- /dev/null
+++ b/yara-compiler.py
@@ -0,0 +1,98 @@
+"""Command-line front end that compiles YARA rule JSON files into .yac files."""
+import argparse
+import logging
+import os
+import re
+import sys
+
+from yara import YaraDatabase
+
+# Any path ending in ".yac" that avoids characters which are not allowed in
+# file names on common platforms.
+_YAC_PATH = re.compile(r"(^\/|^\.\/|^\.\.\/|^[^/])[^:*?\"<>|\r\n]*\.yac$")
+
+
+def dir_path(string):
+    """Validate a path given on the command line (argparse ``type=``).
+
+    Accepts an existing directory, or a path ending in ".yac" that does not
+    have to exist yet, and returns the string unchanged.
+
+    Raises:
+        argparse.ArgumentTypeError: if the value is neither of the two, so
+            that argparse reports a usage error.
+    """
+    if os.path.isdir(string) or _YAC_PATH.match(string):
+        return string
+    raise argparse.ArgumentTypeError(
+        "{!r} is neither an existing directory nor a .yac path".format(string))
+
+
+def walk(args):
+    """Compile every ".json" file found below args["input_directory"].
+
+    If args["output"] ends in ".yac", all rules are merged into that one
+    database. Otherwise args["output"] is used as a directory and each
+    input file is compiled to its own "<name>.yac" inside it.
+    """
+    logger = logging.getLogger(__name__)
+    logger.info("Walking files ...")
+
+    files = [os.path.abspath(os.path.join(dp, f))
+             for dp, _, filenames in os.walk(args["input_directory"])
+             for f in filenames]
+    logger.debug("Files: %s", files)
+    logger.info("Number of files found: %d", len(files))
+
+    # Sorted so that the rule order of a merged database does not depend on
+    # the order in which the file system lists the directory.
+    sources = sorted(f for f in files if f.endswith(".json"))
+
+    if args["output"].endswith(".yac"):
+        yd = YaraDatabase()
+        for file in sources:
+            logger.info("Compiling file %s", file)
+            yd.add_file(file)
+        yd.write_file(args["output"])
+        return
+
+    for file in sources:
+        logger.info("Compiling file %s", file)
+        yd = YaraDatabase()
+        yd.add_file(file)
+        name = os.path.splitext(os.path.basename(file))[0] + ".yac"
+        yd.write_file(os.path.join(args["output"], name))
+
+
+def main():
+    """Parse the command line, set up logging and run the compilation."""
+    parser = argparse.ArgumentParser(description='Compile single or multiple yara files')
+    parser.add_argument('-i', '--input-directory', nargs='?', default='.', type=dir_path, help='Input directory (default: %(default)s)')
+    parser.add_argument('-o', '--output', nargs='?', default='.', type=dir_path, help='Output file or directory (default: %(default)s)')
+    parser.add_argument('-f', '--input-file', nargs='?', default='.', type=dir_path, help='Input file (default: %(default)s)')
+    parser.add_argument('-v', '--verbose', action="count", default=0, help="Verbosity level")
+    args = parser.parse_args()
+
+    # No flag: WARNING, -v: INFO, -vv or more: DEBUG.
+    levels = (logging.WARNING, logging.INFO, logging.DEBUG)
+    log_level = levels[min(args.verbose, len(levels) - 1)]
+
+    logging.basicConfig(stream=sys.stdout, level=log_level)
+    logger = logging.getLogger(__name__)
+
+    # NOTE(review): "input_file" is parsed and passed on, but walk() never
+    # reads it -- confirm whether single-file input is meant to work.
+    args = {
+        "input_directory": args.input_directory,
+        "output": args.output,
+        "input_file": args.input_file,
+        "verbosity": args.verbose
+    }
+
+    logger.debug("args = {}".format(args))
+
+    walk(args)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/yara.py b/yara.py
new file mode 100644
index 0000000..9252c68
--- /dev/null
+++ b/yara.py
@@ -0,0 +1,295 @@
+"""Compile YARA rules given as JSON into the binary "YAC" database format."""
+import json
+import sys
+import os
+import struct
+import re
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def _byte(value):
+    """Return the integer *value* (0-255) as a one-byte ``bytes`` object.
+
+    The byte order is spelled out because ``int.to_bytes`` only has a
+    default for it from Python 3.11 on; for a single byte it has no effect.
+    """
+    return value.to_bytes(1, "little")
+
+
+class OperatorTree(object):
+    """Inner node of a condition tree joining two operands with and/or."""
+
+    def __init__(self):
+        self.left = None      # left operand: OperatorTree, OperatorOf or str
+        self.right = None     # right operand, same kinds as left
+        self.parent = None    # enclosing OperatorTree, None at the root
+        self.operator = None  # condition opcode written after both operands
+        self.data = None      # condition text this node was parsed from
+
+
+class OperatorOf(object):
+    """Condition leaf for an ``<n> of <string set>`` expression."""
+
+    def __init__(self, parent, n, pattern):
+        self.parent = parent
+        self.n = n              # count as written: digits, "all" or "any"
+        self.pattern = pattern  # string set as written, e.g. "them"
+
+
+class YaraDatabase(object):
+    """Collects rule entries from JSON files and writes them as a YAC file."""
+
+    __FORMAT_HEADER = "=3sccI"
+    __FORMAT_ENTRY = "=c{size_id}sc"
+    __FORMAT_STRING = "=c{size_id}scH{size_text}scII"
+    __FORMAT_WILDCARD = "=Ic"
+    __FORMAT_RANGE = "=II"
+    __FORMAT_OPERATOR = "=c"
+    __FORMAT_OPERATOR_OF = "=cc"
+    __FORMAT_OPERATOR_OF_ELEMENT = "=c"
+    __FORMAT_OPERATOR_SINGLE = "=c"
+
+    __STRING_TYPE_STRING = 0
+    __STRING_TYPE_HEX = 1
+    __STRING_TYPE_REGEX = 2
+
+    __PATTERN_RANGE_VARIABLE = re.compile(r"^\[(\d+)-(\d+)\]$")
+    __PATTERN_RANGE_FIXED = re.compile(r"^\[(\d+)\]$")
+    __PATTERN_WILDCARD_HIGH = re.compile(r"^\?[0-9A-Fa-f]$")
+    __PATTERN_WILDCARD_LOW = re.compile(r"^[0-9A-Fa-f]\?$")
+    __PATTERN_WILDCARD_BOTH = re.compile(r"^\?\?$")
+    # The string set is either a parenthesized list, which may contain
+    # blanks after the commas, or a single token such as "them" or "$a*".
+    __PATTERN_OF = re.compile(r"((\d+)|(all)|(any))\s+of\s+(\([^)]*\)|[\w\_\(\)\$\*\,]+)")
+    __PATTERN_AND = re.compile(r"(.*)\s+and\s+(.*)")
+    __PATTERN_OR = re.compile(r"(.*)\s+or\s+(.*)")
+
+    __CONDITION_OPERATOR_OR = 0
+    __CONDITION_OPERATOR_AND = 1
+    __CONDITION_OPERATOR_OF = 2
+    __CONDITION_OPERATOR_SINGLE = 3
+    __CONDITION_OPERATOR_TRUE = 4
+    __CONDITION_OPERATOR_FALSE = 5
+
+    def __init__(self):
+        self.__entries = list()
+
+    @staticmethod
+    def parse_file(file):
+        """Return the list stored under "rules" in the JSON read from *file*."""
+        container = json.load(file)
+        return list(container["rules"])
+
+    @staticmethod
+    def build_tree(condition, parent):
+        """Parse a condition string into a tree.
+
+        The text is split at an ``or`` first, then at an ``and``; what is
+        left is either an ``<n> of <set>`` expression or a plain leaf.
+        Parenthesized sub-conditions and other operators are not interpreted.
+
+        Returns:
+            An OperatorTree for and/or, an OperatorOf for "of" expressions,
+            otherwise the condition text without surrounding whitespace.
+        """
+        logger.debug("Parsing condition = %s", condition)
+        binary_operators = (
+            (YaraDatabase.__PATTERN_OR, YaraDatabase.__CONDITION_OPERATOR_OR),
+            (YaraDatabase.__PATTERN_AND, YaraDatabase.__CONDITION_OPERATOR_AND),
+        )
+        for pattern, operator in binary_operators:
+            match = pattern.search(condition)
+            if match:
+                node = OperatorTree()
+                node.data = condition
+                node.parent = parent
+                node.operator = operator
+                node.left = YaraDatabase.build_tree(match.group(1), node)
+                node.right = YaraDatabase.build_tree(match.group(2), node)
+                return node
+        match = YaraDatabase.__PATTERN_OF.search(condition)
+        if match:
+            logger.debug("Leaf: OperatorOf, n = %s, pattern = %s", match.group(1), match.group(5))
+            return OperatorOf(parent, match.group(1), match.group(5))
+        logger.debug("Leaf: remainder = %s", condition)
+        # Stripped so the leaf compares equal to a string id even when the
+        # operator regexes left whitespace on it.
+        return condition.strip()
+
+    @staticmethod
+    def _string_set_regex(pattern):
+        """Translate the string set of an "of" expression into a regex.
+
+        "them" selects every string. Any other set is read as a
+        comma-separated list of identifiers, optionally parenthesized, in
+        which ``*`` stands for any run of characters.
+        """
+        pattern = pattern.strip()
+        if pattern == "them":
+            return re.compile(r".*")
+        if pattern.count("(") != pattern.count(")"):
+            logger.warning("Unmatched paranthesis in pattern {}".format(pattern))
+        alternatives = []
+        for name in pattern.replace("(", "").replace(")", "").split(","):
+            name = "".join(name.split())
+            if name:
+                # Escape all but "*" so that "$" is matched literally.
+                alternatives.append(".*".join(re.escape(part) for part in name.split("*")))
+        return re.compile("|".join(alternatives))
+
+    @staticmethod
+    def compile_tree(node, strings):
+        """Serialize a condition tree, operands first and operator last.
+
+        Args:
+            node: Value returned by build_tree().
+            strings: String ids of the rule; the encoded condition refers to
+                a string by its position in this sequence.
+
+        Returns:
+            bytearray holding the encoded condition.
+        """
+        op_format = YaraDatabase.__FORMAT_OPERATOR
+        if isinstance(node, OperatorTree):
+            data = YaraDatabase.compile_tree(node.left, strings)
+            data_right = YaraDatabase.compile_tree(node.right, strings)
+            logger.debug("Compiling OperatorTree, left = %s, right = %s", data, data_right)
+            data += data_right
+            data += struct.pack(op_format, _byte(node.operator))
+            return data
+        if isinstance(node, OperatorOf):
+            logger.debug("Compiling OperatorOf, n = %s, pattern = %s", node.n, node.pattern)
+            regex = YaraDatabase._string_set_regex(node.pattern)
+            logger.debug("Patched pattern = %s", regex.pattern)
+            # fullmatch, so "$a" does not also select "$ab"; only "*" widens.
+            of_elements = [i for i, s in enumerate(strings) if regex.fullmatch(s)]
+            # "all" is written as 0 and "any" as 1.
+            n = {"all": 0, "any": 1}.get(node.n, node.n)
+            data = bytearray(struct.pack(op_format, _byte(YaraDatabase.__CONDITION_OPERATOR_OF)))
+            data += struct.pack(YaraDatabase.__FORMAT_OPERATOR_OF, _byte(int(n)), _byte(len(of_elements)))
+            for e in of_elements:
+                data += struct.pack(YaraDatabase.__FORMAT_OPERATOR_OF_ELEMENT, _byte(e))
+            return data
+        if isinstance(node, str):
+            node = node.strip()
+        logger.debug("Compiling single identifier %s", node)
+        for index, s in enumerate(strings):
+            if s == node:
+                data = bytearray(struct.pack(op_format, _byte(YaraDatabase.__CONDITION_OPERATOR_SINGLE)))
+                data += struct.pack(YaraDatabase.__FORMAT_OPERATOR_SINGLE, _byte(index))
+                return data
+        if node == "false":
+            return bytearray(struct.pack(op_format, _byte(YaraDatabase.__CONDITION_OPERATOR_FALSE)))
+        if node != "true":
+            logger.warning("Single identifier {} not found, defaulting to true".format(node))
+        return bytearray(struct.pack(op_format, _byte(YaraDatabase.__CONDITION_OPERATOR_TRUE)))
+
+    def add_file(self, filename):
+        """Append the rules of the JSON file *filename* to the database."""
+        with open(filename, 'r', encoding="utf-8") as f:
+            self.__entries.extend(YaraDatabase.parse_file(f))
+
+    @staticmethod
+    def _compile_hex(hex_text):
+        """Compile the whitespace-separated blocks of a hex string.
+
+        Returns:
+            (text, wildcards, ranges): the literal bytes with wildcard
+            nibbles set to zero; (block index, flag) pairs with flag 1 for
+            "?X" blocks and flag 0 for "X?" blocks ("??" yields both); and
+            (block index, length) pairs, one per length a jump allows.
+        """
+        text = bytearray()
+        wildcards = []
+        ranges = []
+        # split() without argument skips repeated whitespace; an empty block
+        # would use up a block index without contributing anything.
+        for bn, block in enumerate(hex_text.split()):
+            logger.debug("Compiling block = %s", block)
+            match = YaraDatabase.__PATTERN_RANGE_VARIABLE.match(block)
+            if match:
+                # Jump bounds are inclusive: "[4-6]" allows 4, 5 and 6.
+                for length in range(int(match.group(1)), int(match.group(2)) + 1):
+                    logger.debug("Appending range = %s", (bn, length))
+                    ranges.append((bn, length))
+                continue
+            match = YaraDatabase.__PATTERN_RANGE_FIXED.match(block)
+            if match:
+                logger.debug("Appending range = %s", (bn, int(match.group(1))))
+                ranges.append((bn, int(match.group(1))))
+                continue
+            if YaraDatabase.__PATTERN_WILDCARD_HIGH.match(block):
+                wildcards.append((bn, 1))
+            elif YaraDatabase.__PATTERN_WILDCARD_LOW.match(block):
+                wildcards.append((bn, 0))
+            elif YaraDatabase.__PATTERN_WILDCARD_BOTH.match(block):
+                wildcards.append((bn, 0))
+                wildcards.append((bn, 1))
+            else:
+                text += bytearray.fromhex(block)
+                continue
+            text += bytearray.fromhex(block.replace('?', '0'))
+        return text, wildcards, ranges
+
+    def _compile_string(self, s):
+        """Return the encoded form of one string definition of a rule."""
+        logger.debug("Compiling string %s", s["id"])
+        text = bytearray()
+        wildcards = []
+        ranges = []
+        if s["type"] == YaraDatabase.__STRING_TYPE_HEX:
+            text, wildcards, ranges = YaraDatabase._compile_hex(s["text"])
+        elif s["type"] in (YaraDatabase.__STRING_TYPE_STRING, YaraDatabase.__STRING_TYPE_REGEX):
+            text += s["text"].encode("utf-8")
+        # One bit per modifier, from bit 6 ("nocase") down to bit 0 ("s").
+        modifiers = 0
+        for flag in ("nocase", "ascii", "wide", "fullword", "private", "i", "s"):
+            modifiers = (modifiers << 1) | (1 if s["modifiers"][flag] else 0)
+        # Length of the encoded id, which differs from len(str) for non-ASCII.
+        encoded_id = s["id"].encode("utf-8")
+        string_format = self.__FORMAT_STRING.format(size_id=len(encoded_id), size_text=len(text))
+        data = bytearray(struct.pack(
+            string_format, _byte(len(encoded_id)), encoded_id, _byte(s["type"]),
+            len(text), bytes(text), _byte(modifiers), len(wildcards), len(ranges)))
+        for block, flag in wildcards:
+            data += struct.pack(self.__FORMAT_WILDCARD, block, _byte(flag))
+        for block, length in ranges:
+            data += struct.pack(self.__FORMAT_RANGE, block, length)
+        return data
+
+    def _compile_entry(self, entry):
+        """Return one rule encoded as entry header, strings and condition."""
+        identifier = entry["identifier"]
+        strings = entry["strings"]
+        condition = entry["condition"]
+        logger.debug("Compiling entry %s", identifier)
+        # Length of the encoded id, which differs from len(str) for non-ASCII.
+        encoded_id = identifier.encode("utf-8")
+        entry_format = self.__FORMAT_ENTRY.format(size_id=len(encoded_id))
+        data = bytearray(struct.pack(entry_format, _byte(len(encoded_id)), encoded_id, _byte(len(strings))))
+        logger.debug("Entry data = %s", data)
+        for s in strings:
+            data += self._compile_string(s)
+        logger.debug("Building conditional operator tree for entry = %s, condition = %s", identifier, condition)
+        node = YaraDatabase.build_tree(condition, None)
+        logger.debug("Compiling conditional operator tree for entry = %s, condition = %s", identifier, condition)
+        data += YaraDatabase.compile_tree(node, [s["id"] for s in strings])
+        logger.debug("Compilation done for entry %s", identifier)
+        return data
+
+    def write_file(self, filename):
+        """Compile all collected rules and write them to *filename*.
+
+        The database is assembled in memory before the file is opened, so a
+        rule that fails to compile does not leave a truncated file behind.
+        """
+        header = struct.pack(self.__FORMAT_HEADER, "YAC".encode("utf-8"), b'\x00', b'\x00', len(self.__entries))
+        logger.debug("Header data = %s", header)
+        data = bytearray(header)
+        for entry in self.__entries:
+            data += self._compile_entry(entry)
+        with open(filename, 'wb') as f:
+            f.write(data)
+        logger.debug("Compilation done for file %s", filename)
\ No newline at end of file
-- 
cgit v1.2.1