ayon-core/pype/tests/test_mongo_performance.py
Petr Kalis ef1f94aecc Hound
2020-09-22 14:48:09 +02:00

297 lines
11 KiB
Python

import pymongo
import bson
import random
from datetime import datetime
import os
class TestPerformance():
'''
Class for testing performance of representation and their 'files'
parts.
Discussion is if embedded array:
'files' : [ {'_id': '1111', 'path':'....},
{'_id'...}]
OR documents:
'files' : {
'1111': {'path':'....'},
'2222': {'path':'...'}
}
is faster.
Current results:
without additional partial index documents is 3x faster
With index is array 50x faster then document
Partial index something like:
db.getCollection('performance_test').createIndex
({'files._id': 1},
{partialFilterExpresion: {'files': {'$exists': true}}})
!DIDNT work for me, had to create manually in Compass
'''
MONGO_URL = 'mongodb://localhost:27017'
MONGO_DB = 'performance_test'
MONGO_COLLECTION = 'performance_test'
MAX_FILE_SIZE_B = 5000
MAX_NUMBER_OF_SITES = 50
ROOT_DIR = "C:/projects"
inserted_ids = []
def __init__(self, version='array'):
'''
It creates and fills collection, based on value of 'version'.
:param version: 'array' - files as embedded array,
'doc' - as document
'''
self.client = pymongo.MongoClient(self.MONGO_URL)
self.db = self.client[self.MONGO_DB]
self.collection_name = self.MONGO_COLLECTION
self.version = version
if self.version != 'array':
self.collection_name = self.MONGO_COLLECTION + '_doc'
self.collection = self.db[self.collection_name]
self.ids = [] # for testing
self.inserted_ids = []
def prepare(self, no_of_records=100000, create_files=False):
'''
Produce 'no_of_records' of representations with 'files' segment.
It depends on 'version' value in constructor, 'arrray' or 'doc'
:return:
'''
print('Purging {} collection'.format(self.collection_name))
self.collection.delete_many({})
id = bson.objectid.ObjectId()
insert_recs = []
for i in range(no_of_records):
file_id = bson.objectid.ObjectId()
file_id2 = bson.objectid.ObjectId()
file_id3 = bson.objectid.ObjectId()
self.inserted_ids.extend([file_id, file_id2, file_id3])
version_str = "v{0:03}".format(i + 1)
file_name = "test_Cylinder_workfileLookdev_{}.mb".\
format(version_str)
document = {"files": self.get_files(self.version, i + 1,
file_id, file_id2, file_id3,
create_files)
,
"context": {
"subset": "workfileLookdev",
"username": "petrk",
"task": "lookdev",
"family": "workfile",
"hierarchy": "Assets",
"project": {"code": "test", "name": "Test"},
"version": 1,
"asset": "Cylinder",
"representation": "mb",
"root": self.ROOT_DIR
},
"dependencies": [],
"name": "mb",
"parent": {"oid": '{}'.format(id)},
"data": {
"path": "C:\\projects\\Test\\Assets\\Cylinder\\publish\\workfile\\workfileLookdev\\{}\\{}".format(version_str, file_name),
"template": "{root}\\{project[name]}\\{hierarchy}\\{asset}\\publish\\{family}\\{subset}\\v{version:0>3}\\{project[code]}_{asset}_{subset}_v{version:0>3}<_{output}><.{frame:0>4}>.{representation}"
},
"type": "representation",
"schema": "pype:representation-2.0"
}
insert_recs.append(document)
print('Prepared {} records in {} collection'.
format(no_of_records, self.collection_name))
self.collection.insert_many(insert_recs)
# TODO refactore to produce real array and not needeing ugly regex
self.collection.insert_one({"inserted_id": self.inserted_ids})
print('-' * 50)
def run(self, queries=1000, loops=3):
'''
Run X'queries' that are searching collection Y'loops' times
:param queries: how many times do ..find(...)
:param loops: loop of testing X queries
:return: None
'''
print('Testing version {} on {}'.format(self.version,
self.collection_name))
print('Queries rung {} in {} loops'.format(queries, loops))
inserted_ids = list(self.collection.
find({"inserted_id": {"$exists": True}}))
import re
self.ids = re.findall("'[0-9a-z]*'", str(inserted_ids))
import time
found_cnt = 0
for _ in range(loops):
print('Starting loop {}'.format(_))
start = time.time()
for _ in range(queries):
# val = random.choice(self.ids)
# val = val.replace("'", '')
val = random.randint(0, 50)
print(val)
if (self.version == 'array'):
# prepared for partial index, without 'files': exists
# wont engage
found = self.collection.\
find({'files': {"$exists": True},
'files.sites.name': "local_{}".format(val)}).\
count()
else:
key = "files.{}".format(val)
found = self.collection.find_one({key: {"$exists": True}})
print("found {} records".format(found))
# if found:
# found_cnt += len(list(found))
end = time.time()
print('duration per loop {}'.format(end - start))
print("found_cnt {}".format(found_cnt))
def get_files(self, mode, i, file_id, file_id2, file_id3,
create_files=False):
'''
Wrapper to decide if 'array' or document version should be used
:param mode: 'array'|'doc'
:param i: step number
:param file_id: ObjectId of first dummy file
:param file_id2: ..
:param file_id3: ..
:return:
'''
if mode == 'array':
return self.get_files_array(i, file_id, file_id2, file_id3,
create_files)
else:
return self.get_files_doc(i, file_id, file_id2, file_id3)
def get_files_array(self, i, file_id, file_id2, file_id3,
create_files=False):
ret = [
{
"path": "{root}" + "/Test/Assets/Cylinder/publish/workfile/" +
"workfileLookdev/v{0:03}/" +
"test_Cylinder_A_workfileLookdev_v{0:03}.dat"
.format(i, i),
"_id": '{}'.format(file_id),
"hash": "temphash",
"sites": self.get_sites(self.MAX_NUMBER_OF_SITES),
"size": random.randint(0, self.MAX_FILE_SIZE_B)
},
{
"path": "{root}" + "/Test/Assets/Cylinder/publish/workfile/" +
"workfileLookdev/v{0:03}/" +
"test_Cylinder_B_workfileLookdev_v{0:03}.dat"
.format(i, i),
"_id": '{}'.format(file_id2),
"hash": "temphash",
"sites": self.get_sites(self.MAX_NUMBER_OF_SITES),
"size": random.randint(0, self.MAX_FILE_SIZE_B)
},
{
"path": "{root}" + "/Test/Assets/Cylinder/publish/workfile/" +
"workfileLookdev/v{0:03}/" +
"test_Cylinder_C_workfileLookdev_v{0:03}.dat"
.format(i, i),
"_id": '{}'.format(file_id3),
"hash": "temphash",
"sites": self.get_sites(self.MAX_NUMBER_OF_SITES),
"size": random.randint(0, self.MAX_FILE_SIZE_B)
}
]
if create_files:
for f in ret:
path = f.get("path").replace("{root}", self.ROOT_DIR)
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, 'wb') as fp:
fp.write(os.urandom(f.get("size")))
return ret
def get_files_doc(self, i, file_id, file_id2, file_id3):
ret = {}
ret['{}'.format(file_id)] = {
"path": "{root}" +
"/Test/Assets/Cylinder/publish/workfile/workfileLookdev/"
"v001/test_CylinderA_workfileLookdev_v{0:03}.mb".format(i),
"hash": "temphash",
"sites": ["studio"],
"size": 87236
}
ret['{}'.format(file_id2)] = {
"path": "{root}" +
"/Test/Assets/Cylinder/publish/workfile/workfileLookdev/"
"v001/test_CylinderB_workfileLookdev_v{0:03}.mb".format(i),
"hash": "temphash",
"sites": ["studio"],
"size": 87236
}
ret['{}'.format(file_id3)] = {
"path": "{root}" +
"/Test/Assets/Cylinder/publish/workfile/workfileLookdev/"
"v001/test_CylinderC_workfileLookdev_v{0:03}.mb".format(i),
"hash": "temphash",
"sites": ["studio"],
"size": 87236
}
return ret
def get_sites(self, number_of_sites=50):
"""
Return array of sites declaration.
Currently on 1st site has "created_dt" fillled, which should
trigger upload to 'gdrive' site.
'gdrive' site is appended, its destination for syncing for
Sync Server
Args:
number_of_sites:
Returns:
"""
sites = []
for i in range(number_of_sites):
site = {'name': "local_{}".format(i)}
# do not create null 'created_dt' field, Mongo doesnt like it
if i == 0:
site['created_dt'] = datetime.now()
sites.append(site)
sites.append({'name': "gdrive"})
return sites
if __name__ == '__main__':
tp = TestPerformance('array')
tp.prepare(no_of_records=10, create_files=True) # enable to prepare data
# tp.run(10, 3)
# print('-'*50)
#
# tp = TestPerformance('doc')
# tp.prepare() # enable to prepare data
# tp.run(1000, 3)