ayon-core/pype/tests/test_mongo_performance.py

import pymongo
import bson
import random
from datetime import datetime
import os


class TestPerformance():
    '''
        Class for testing performance of representation and their 'files'
        parts.
        Discussion is if embedded array:
                            'files' : [ {'_id': '1111', 'path':'....},
                                        {'_id'...}]
                     OR documents:
                            'files' : {
                                            '1111': {'path':'....'},
                                            '2222': {'path':'...'}
                                        }
                     is faster.

        Current results:
            without additional partial index documents is 3x faster
            With index is array 50x faster then document

        Partial index something like:
        db.getCollection('performance_test').createIndex
            ({'files._id': 1},
            {partialFilterExpresion: {'files': {'$exists': true}}})
        !DIDNT work for me, had to create manually in Compass

    '''

    MONGO_URL = 'mongodb://localhost:27017'
    MONGO_DB = 'performance_test'
    MONGO_COLLECTION = 'performance_test'

    MAX_FILE_SIZE_B = 5000
    MAX_NUMBER_OF_SITES = 50
    ROOT_DIR = "C:/projects"

    inserted_ids = []

    def __init__(self, version='array'):
        '''
            It creates and fills collection, based on value of 'version'.

        :param version: 'array' - files as embedded array,
                        'doc' - as document
        '''
        self.client = pymongo.MongoClient(self.MONGO_URL)
        self.db = self.client[self.MONGO_DB]
        self.collection_name = self.MONGO_COLLECTION

        self.version = version

        if self.version != 'array':
            self.collection_name = self.MONGO_COLLECTION + '_doc'

        self.collection = self.db[self.collection_name]

        self.ids = []  # for testing
        self.inserted_ids = []

    def prepare(self, no_of_records=100000, create_files=False):
        '''
            Produce 'no_of_records' of representations with 'files' segment.
            It depends on 'version' value in constructor, 'arrray' or 'doc'
        :return:
        '''
        print('Purging {} collection'.format(self.collection_name))
        self.collection.delete_many({})

        id = bson.objectid.ObjectId()

        insert_recs = []
        for i in range(no_of_records):
            file_id = bson.objectid.ObjectId()
            file_id2 = bson.objectid.ObjectId()
            file_id3 = bson.objectid.ObjectId()

            self.inserted_ids.extend([file_id, file_id2, file_id3])
            version_str = "v{0:03}".format(i + 1)
            file_name = "test_Cylinder_workfileLookdev_{}.mb".\
                format(version_str)

            document = {"files": self.get_files(self.version, i + 1,
                                                file_id, file_id2, file_id3,
                                                create_files)
                        ,
                        "context": {
                            "subset": "workfileLookdev",
                            "username": "petrk",
                            "task": "lookdev",
                            "family": "workfile",
                            "hierarchy": "Assets",
                            "project": {"code": "test", "name": "Test"},
                            "version": 1,
                            "asset": "Cylinder",
                            "representation": "mb",
                            "root": self.ROOT_DIR
                        },
                        "dependencies": [],
                        "name": "mb",
                        "parent": {"oid": '{}'.format(id)},
                        "data": {
                            "path": "C:\\projects\\Test\\Assets\\Cylinder\\publish\\workfile\\workfileLookdev\\{}\\{}".format(version_str, file_name),
                            "template": "{root}\\{project[name]}\\{hierarchy}\\{asset}\\publish\\{family}\\{subset}\\v{version:0>3}\\{project[code]}_{asset}_{subset}_v{version:0>3}<_{output}><.{frame:0>4}>.{representation}"
                        },
                        "type": "representation",
                        "schema": "pype:representation-2.0"
                        }

            insert_recs.append(document)

        print('Prepared {} records in {} collection'.
              format(no_of_records, self.collection_name))

        self.collection.insert_many(insert_recs)
        # TODO refactore to produce real array and not needeing ugly regex
        self.collection.insert_one({"inserted_id": self.inserted_ids})
        print('-' * 50)

    def run(self, queries=1000, loops=3):
        '''
            Run X'queries' that are searching collection Y'loops' times
        :param queries: how many times do ..find(...)
        :param loops:  loop of testing X queries
        :return: None
        '''
        print('Testing version {} on {}'.format(self.version,
                                                self.collection_name))
        print('Queries rung {} in {} loops'.format(queries, loops))

        inserted_ids = list(self.collection.
                            find({"inserted_id": {"$exists": True}}))
        import re
        self.ids = re.findall("'[0-9a-z]*'", str(inserted_ids))

        import time

        found_cnt = 0
        for _ in range(loops):
            print('Starting loop {}'.format(_))
            start = time.time()
            for _ in range(queries):
                # val = random.choice(self.ids)
                # val = val.replace("'", '')
                val = random.randint(0, 50)
                print(val)

                if (self.version == 'array'):
                    # prepared for partial index, without 'files': exists
                    # wont engage
                    found = self.collection.\
                        find({'files': {"$exists": True},
                              'files.sites.name': "local_{}".format(val)}).\
                        count()
                else:
                    key = "files.{}".format(val)
                    found = self.collection.find_one({key: {"$exists": True}})
                print("found {} records".format(found))
                # if found:
                #     found_cnt += len(list(found))

            end = time.time()
            print('duration per loop {}'.format(end - start))
            print("found_cnt {}".format(found_cnt))

    def get_files(self, mode, i, file_id, file_id2, file_id3,
                  create_files=False):
        '''
            Wrapper to decide if 'array' or document version should be used
        :param mode: 'array'|'doc'
        :param i: step number
        :param file_id: ObjectId of first dummy file
        :param file_id2: ..
        :param file_id3: ..
        :return:
        '''
        if mode == 'array':
            return self.get_files_array(i, file_id, file_id2, file_id3,
                                        create_files)
        else:
            return self.get_files_doc(i, file_id, file_id2, file_id3)

    def get_files_array(self, i, file_id, file_id2, file_id3,
                        create_files=False):
        ret = [
            {
                 "path": "{root}" + "/Test/Assets/Cylinder/publish/workfile/" +
                         "workfileLookdev/v{0:03}/" +
                         "test_Cylinder_A_workfileLookdev_v{0:03}.dat"
                         .format(i, i),
                 "_id": '{}'.format(file_id),
                 "hash": "temphash",
                 "sites": self.get_sites(self.MAX_NUMBER_OF_SITES),
                 "size": random.randint(0, self.MAX_FILE_SIZE_B)
            },
            {
                "path": "{root}" + "/Test/Assets/Cylinder/publish/workfile/" +
                        "workfileLookdev/v{0:03}/" +
                        "test_Cylinder_B_workfileLookdev_v{0:03}.dat"
                        .format(i, i),
                "_id": '{}'.format(file_id2),
                "hash": "temphash",
                "sites": self.get_sites(self.MAX_NUMBER_OF_SITES),
                "size": random.randint(0, self.MAX_FILE_SIZE_B)
            },
            {
                "path": "{root}" + "/Test/Assets/Cylinder/publish/workfile/" +
                        "workfileLookdev/v{0:03}/" +
                        "test_Cylinder_C_workfileLookdev_v{0:03}.dat"
                        .format(i, i),
                "_id": '{}'.format(file_id3),
                "hash": "temphash",
                "sites": self.get_sites(self.MAX_NUMBER_OF_SITES),
                "size": random.randint(0, self.MAX_FILE_SIZE_B)
            }

            ]
        if create_files:
            for f in ret:
                path = f.get("path").replace("{root}", self.ROOT_DIR)
                os.makedirs(os.path.dirname(path), exist_ok=True)
                with open(path, 'wb') as fp:
                    fp.write(os.urandom(f.get("size")))

        return ret

    def get_files_doc(self, i, file_id, file_id2, file_id3):
        ret = {}
        ret['{}'.format(file_id)] = {
            "path": "{root}" +
                    "/Test/Assets/Cylinder/publish/workfile/workfileLookdev/"
                    "v001/test_CylinderA_workfileLookdev_v{0:03}.mb".format(i),
            "hash": "temphash",
            "sites": ["studio"],
            "size": 87236
        }

        ret['{}'.format(file_id2)] = {
            "path": "{root}" +
                    "/Test/Assets/Cylinder/publish/workfile/workfileLookdev/"
                    "v001/test_CylinderB_workfileLookdev_v{0:03}.mb".format(i),
            "hash": "temphash",
            "sites": ["studio"],
            "size": 87236
        }
        ret['{}'.format(file_id3)] = {
            "path": "{root}" +
                    "/Test/Assets/Cylinder/publish/workfile/workfileLookdev/"
                    "v001/test_CylinderC_workfileLookdev_v{0:03}.mb".format(i),
            "hash": "temphash",
            "sites": ["studio"],
            "size": 87236
        }

        return ret

    def get_sites(self, number_of_sites=50):
        """
            Return array of sites declaration.
            Currently on 1st site has "created_dt" fillled, which should
            trigger upload to 'gdrive' site.
            'gdrive' site is appended, its destination for syncing for
            Sync Server
        Args:
            number_of_sites:

        Returns:

        """
        sites = []
        for i in range(number_of_sites):
            site = {'name': "local_{}".format(i)}
            # do not create null 'created_dt' field, Mongo doesnt like it
            if i == 0:
                site['created_dt'] = datetime.now()

            sites.append(site)

        sites.append({'name': "gdrive"})

        return sites


if __name__ == '__main__':
    tp = TestPerformance('array')
    tp.prepare(no_of_records=10, create_files=True)  # enable to prepare data
    # tp.run(10, 3)

    # print('-'*50)
    #
    # tp = TestPerformance('doc')
    # tp.prepare()  # enable to prepare data
    # tp.run(1000, 3)