From d4fb23f79860e45ad537ee3c076b04f957214de9 Mon Sep 17 00:00:00 2001
From: oscar <>
Date: Fri, 29 Nov 2024 14:16:44 -0800
Subject: [PATCH] Initial commit

---
 .gitattributes          |   2 +
 .gitignore              | 160 ++++++++++++++++++++++++++++++++++++++++
 CLEAN_FROM_OLD_DUPES.py |  24 ++++++
 config.py               |  27 +++++++
 dump_facebook.py        | 105 ++++++++++++++++++++++++++
 dump_instagram.py       |  77 +++++++++++++++++++
 dump_missing_data.py    |  82 ++++++++++++++++++++
 dump_tiktok.py          |  67 +++++++++++++++++
 dupes_by_hash copy.py   |  13 ++++
 dupes_by_hash.py        |  22 ++++++
 10 files changed, 579 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 .gitignore
 create mode 100644 CLEAN_FROM_OLD_DUPES.py
 create mode 100644 config.py
 create mode 100644 dump_facebook.py
 create mode 100644 dump_instagram.py
 create mode 100644 dump_missing_data.py
 create mode 100644 dump_tiktok.py
 create mode 100644 dupes_by_hash copy.py
 create mode 100644 dupes_by_hash.py

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..dfe0770
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+# Auto detect text files and perform LF normalization
+* text=auto
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..68bc17f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/CLEAN_FROM_OLD_DUPES.py b/CLEAN_FROM_OLD_DUPES.py
new file mode 100644
index 0000000..0bb75ef
--- /dev/null
+++ b/CLEAN_FROM_OLD_DUPES.py
@@ -0,0 +1,24 @@
+import config
+
+# Two connections: altpins is the table being cleaned, storysave is the reference.
+altpins_db, altpins_cursor = config.altpins_gen_connection()
+db, cursor = config.gen_connection()
+
+# Map (title/username, hash) -> (row id, url) for each table.
+altpins_cursor.execute("SELECT id, title, hash, url FROM pins WHERE hash IS NOT NULL;")
+altpins_results = { (row[1], row[2]): (row[0], row[3]) for row in altpins_cursor.fetchall() }
+
+cursor.execute("SELECT id, username, hash, media_url FROM media WHERE hash IS NOT NULL;")
+media_results = { (row[1], row[2]): (row[0], row[3]) for row in cursor.fetchall() }
+
+# Any key present in both tables is a duplicate pin; delete it from altpins.
+common_items = set(altpins_results.keys()) & set(media_results.keys())
+
+for title, hash_value in common_items:
+    altpins_id, altpins_url = altpins_results[(title, hash_value)]
+    media_id, media_url = media_results[(title, hash_value)]
+
+    print(f"Found a match for hash {hash_value} with title {title}")
+    print(f"Altpins URL: {altpins_url}")
+    print(f"Media URL: {media_url}")
+
+    altpins_cursor.execute("DELETE FROM pins WHERE id = %s;", [altpins_id])
+    altpins_db.commit()
+    print(f"Deleted pin {altpins_id}. {altpins_cursor.rowcount} rows affected")
\ No newline at end of file
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..4c51f7c
--- /dev/null
+++ b/config.py
@@ -0,0 +1,27 @@
+import mysql.connector
+
+# WARNING: live credentials are hardcoded and committed here; prefer loading
+# them from environment variables (see the note below this file).
+altpins_username = "xantorn"
+altpins_password = "AVNS_lGiLOVTTyGMtoOoRn5Q"
+altpins_host = "archivebate-db-do-user-13308724-0.b.db.ondigitalocean.com"
+altpins_port = 25060
+altpins_database = "altpins"
+altpins_sslmode = "REQUIRED"  # informational only; not passed to mysql.connector below
+
+def altpins_gen_connection():
+    print("Connecting to altpins database")
+    newDB = mysql.connector.connect(host=altpins_host, user=altpins_username, password=altpins_password, database=altpins_database, port=altpins_port)
+    print("Connected to altpins database")
+    return newDB, newDB.cursor()
+
+username = "doadmin"
+password = "AVNS_KNXK1IjScgTCe09gI9F"
+host = "storysave-do-user-13308724-0.c.db.ondigitalocean.com"
+port = 25060
+database = "storysave"
+sslmode = "REQUIRED"  # informational only; not passed to mysql.connector below
+
+def gen_connection():
+    print("Connecting to storysave database")
+    newDB = mysql.connector.connect(host=host, user=username, password=password, database=database, port=port)
+    print("Connected to storysave database")
+    return newDB, newDB.cursor()
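A note on config.py: committing passwords to version control means anyone with repository access can use them, and rotating them requires a new commit. Below is a minimal sketch of the same module reading its settings from the environment instead; the STORYSAVE_DB_* variable names are hypothetical and defined nowhere in this patch:

import os
import mysql.connector

# Hypothetical variable names; export them in the shell or load them with a dotenv tool.
host = os.environ["STORYSAVE_DB_HOST"]
username = os.environ["STORYSAVE_DB_USER"]
password = os.environ["STORYSAVE_DB_PASSWORD"]
database = os.environ.get("STORYSAVE_DB_NAME", "storysave")
port = int(os.environ.get("STORYSAVE_DB_PORT", "25060"))

def gen_connection():
    newDB = mysql.connector.connect(host=host, user=username, password=password, database=database, port=port)
    return newDB, newDB.cursor()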
diff --git a/dump_facebook.py b/dump_facebook.py
new file mode 100644
index 0000000..ae59428
--- /dev/null
+++ b/dump_facebook.py
@@ -0,0 +1,105 @@
+from BunnyCDN.Storage import Storage
+import os, uuid, config, funcs
+from datetime import datetime
+from PIL import Image
+
+def dump_facebook(folder_path):
+    # Loose files at the top level: the username is everything before the first apostrophe.
+    for filename in os.listdir(folder_path):
+        if os.path.isdir(os.path.join(folder_path, filename)):
+            continue
+
+        username = filename.split("'")[0]
+        filepath = os.path.join(folder_path, filename)
+
+        mediatype = funcs.get_media_type(filename)
+        post_type = funcs.determine_post_type(filepath, mediatype)
+
+        upload_file(username=username, media_type=mediatype, filepath=filepath, post_type=post_type)
+
+    # Subfolders: the folder name is the username.
+    for folder in os.listdir(folder_path):
+        if os.path.isdir(os.path.join(folder_path, folder)):
+            username = folder
+
+            for filename in os.listdir(os.path.join(folder_path, folder)):
+                filepath = os.path.join(folder_path, folder, filename)
+
+                mediatype = funcs.get_media_type(filename)
+                post_type = funcs.determine_post_type(filepath, mediatype)
+
+                upload_file(username=username, media_type=mediatype, filepath=filepath, post_type=post_type)
+
+def upload_file(filepath, username, media_type='image', post_type='story', timestamp=None, user_id=None):
+    filename = os.path.basename(filepath)
+    file_extension = os.path.splitext(filename)[1].lower()
+
+    file_hash = funcs.calculate_file_hash(filepath)
+
+    if file_hash in existing_files:
+        print('Duplicate file detected. Removing...')
+        os.remove(filepath)
+        return False
+
+    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0
+
+    # FB_IMG_<id>.<ext> files carry a usable media id; otherwise generate one.
+    if "FB_IMG" in filename:
+        media_id = filename.split("_")[2].split(".")[0]
+    else:
+        media_id = uuid.uuid4().hex
+
+    # Build the remote path with forward slashes; os.path.join would use
+    # backslashes on Windows and corrupt the CDN URL.
+    dirtype = funcs.determine_post_type(filepath, media_type)
+    server_path = f'media/{dirtype}/{username}/{media_id}{file_extension}'
+
+    try:
+        obj_storage.PutFile(filepath, server_path)
+    except Exception as e:
+        print(f"Failed to upload {filepath} to storage: {e}")
+        return False
+
+    file_url = f"https://storysave.b-cdn.net/{server_path}"
+
+    if media_type == 'image':
+        with Image.open(filepath) as img:
+            width, height = img.size
+    else:
+        width, height = funcs.get_video_dimensions(filepath)
+
+    post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()
+
+    # Normalize the folder name to the value stored in the database.
+    post_type = 'story' if post_type == 'stories' else 'post'
+
+    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, user_id, platform, hash, filename, duration) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
+    values = (username, media_type, file_url, width, height, post_type, post_date, user_id, 'facebook', file_hash, filename, duration)
+
+    try:
+        newCursor.execute(query, values)
+        newDB.commit()
+        print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
+    except Exception as e:
+        print(f"Database error: {e}")
+        return False
+
+    # Remember the hash so the same content is not re-uploaded later in this run.
+    existing_files.add(file_hash)
+
+    try:
+        if newCursor.rowcount > 0:
+            os.remove(filepath)
+    except Exception as e:
+        print(f"Failed to remove local file {filepath}: {e}")
+
+    return True
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    newDB, newCursor = config.gen_connection()
+
+    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+    # A set gives O(1) duplicate checks; a list would rescan on every file.
+    newCursor.execute("SELECT hash FROM media WHERE platform='facebook' AND hash IS NOT NULL")
+    existing_files = {image[0] for image in newCursor.fetchall()}
+
+    dump_facebook('facebook/')
+
+    print("Processing completed.")
\ No newline at end of file
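Every script in this commit imports funcs, but funcs.py itself is not included. The sketch below reconstructs what its five helpers appear to do, inferred purely from the call sites; the MD5 digest, the ffprobe-based probing, and the portrait-means-story rule in determine_post_type are assumptions, not the author's actual implementation:

# funcs.py -- hypothetical reconstruction; this file is not part of the commit.
import hashlib, json, os, subprocess
from PIL import Image

VIDEO_EXTS = {'.mp4', '.mov', '.webm', '.mkv'}

def get_media_type(filename):
    # Classify by extension: anything not recognized as video is treated as an image.
    return 'video' if os.path.splitext(filename)[1].lower() in VIDEO_EXTS else 'image'

def calculate_file_hash(filepath, chunk_size=1 << 20):
    # Content digest used for deduplication; read in chunks to bound memory use.
    digest = hashlib.md5()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

def _probe(filepath):
    # Ask ffprobe for the first video stream's geometry and the container duration.
    out = subprocess.check_output([
        'ffprobe', '-v', 'error', '-select_streams', 'v:0',
        '-show_entries', 'stream=width,height:format=duration',
        '-of', 'json', filepath])
    return json.loads(out)

def get_video_dimensions(filepath):
    stream = _probe(filepath)['streams'][0]
    return stream['width'], stream['height']

def get_video_duration(filepath):
    return float(_probe(filepath)['format'].get('duration', 0))

def determine_post_type(filepath, media_type):
    # Bucket media into 'stories' or 'posts'. The real rule is unknown;
    # treating portrait media as stories is only a placeholder heuristic.
    if media_type == 'video':
        width, height = get_video_dimensions(filepath)
    else:
        with Image.open(filepath) as img:
            width, height = img.size
    return 'stories' if height > width else 'posts'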
diff --git a/dump_instagram.py b/dump_instagram.py
new file mode 100644
index 0000000..35bda77
--- /dev/null
+++ b/dump_instagram.py
@@ -0,0 +1,77 @@
+from BunnyCDN.Storage import Storage
+from datetime import datetime
+import os, config, funcs
+from PIL import Image
+
+def dump_instagram(folder_path):
+    # Filenames look like <username>_<timestamp>_<media_id>_<user_id>.<ext>,
+    # where the username itself may contain underscores (see the example below this file).
+    for filename in os.listdir(folder_path):
+        if os.path.isdir(os.path.join(folder_path, filename)):
+            continue
+
+        parts = filename.split('_')
+        if len(parts) < 4:
+            continue
+
+        try:
+            username = '_'.join(parts[:-3])
+            timestamp = int(parts[-3])
+            media_id = int(parts[-2])
+            user_id = int(parts[-1].split('.')[0])
+        except ValueError:
+            # Unparseable names go to a manual-sort folder instead of aborting the run.
+            print(f"Invalid filename: {filename}")
+            os.makedirs(os.path.join(folder_path, 'sort'), exist_ok=True)
+            os.rename(os.path.join(folder_path, filename), os.path.join(folder_path, 'sort', filename))
+            continue
+
+        filepath = os.path.join(folder_path, filename)
+        UploadMedia(username=username, filepath=filepath, media_id=media_id, timestamp=timestamp, user_id=user_id)
+
+
+def UploadMedia(filepath, username, media_id=None, timestamp=None, user_id=None):
+    filename = os.path.basename(filepath)
+    file_extension = os.path.splitext(filename)[1].lower()
+
+    # Bail out early on known media ids, before any hashing, probing, or uploading.
+    if media_id and int(media_id) in existing_files:
+        print('Duplicate file detected. Removing...')
+        os.remove(filepath)
+        return True
+
+    media_type = funcs.get_media_type(filename)
+    post_type = funcs.determine_post_type(filepath, media_type)
+    file_hash = funcs.calculate_file_hash(filepath)
+    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0
+    post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()
+
+    if media_type == 'video':
+        width, height = funcs.get_video_dimensions(filepath)
+    else:
+        # The context manager closes the handle before os.remove() below;
+        # a bare Image.open() can keep the file locked on Windows.
+        with Image.open(filepath) as img:
+            width, height = img.size
+
+    server_path = f'media/{post_type}/{username}/{media_id}{file_extension}'
+    file_url = f"https://storysave.b-cdn.net/{server_path}"
+
+    obj_storage.PutFile(filepath, server_path)
+
+    # platform is set explicitly so the dedup query in __main__ matches these rows.
+    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, platform, hash, filename, duration) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
+    values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, 'instagram', file_hash, filename, duration)
+
+    newCursor.execute(query, values)
+    newDB.commit()
+    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
+
+    os.remove(filepath)
+
+    return True
+
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    newDB, newCursor = config.gen_connection()
+
+    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+    # A set gives O(1) duplicate checks.
+    newCursor.execute("SELECT media_id FROM media WHERE platform='instagram' AND media_id IS NOT NULL")
+    existing_files = {image[0] for image in newCursor.fetchall()}
+
+    dump_instagram('storysaver/')
+
+    print("Processing completed.")
\ No newline at end of file
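The filename parsing at the top of dump_instagram.py is the delicate part: usernames may themselves contain underscores, so the script splits on '_' and rejoins everything except the last three fields. A worked example with a hypothetical filename:

filename = "jane_doe_99_1732900000_3141592653_271828.jpg"  # hypothetical
parts = filename.split('_')

username = '_'.join(parts[:-3])           # "jane_doe_99"
timestamp = int(parts[-3])                # 1732900000
media_id = int(parts[-2])                 # 3141592653
user_id = int(parts[-1].split('.')[0])    # 271828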
diff --git a/dump_missing_data.py b/dump_missing_data.py
new file mode 100644
index 0000000..decb148
--- /dev/null
+++ b/dump_missing_data.py
@@ -0,0 +1,82 @@
+from BunnyCDN.Storage import Storage
+from datetime import datetime
+import os, config, funcs
+from PIL import Image
+
+def dump_instagram(folder_path):
+    # Filenames here carry only two trailing fields: <username>_<timestamp>_<user_id>.<ext>.
+    for filename in os.listdir(folder_path):
+        parts = filename.split('_')
+
+        try:
+            username = '_'.join(parts[:-2])         # everything before the last two fields
+            timestamp = int(parts[-2])              # second-to-last field is the timestamp
+            user_id = int(parts[-1].split('.')[0])  # last field, before the extension, is the user id
+        except ValueError as e:
+            print(f"Invalid filename: {filename}. Error: {e}")
+            continue
+
+        filepath = os.path.join(folder_path, filename)
+
+        mediatype = funcs.get_media_type(filename)
+        post_type = funcs.determine_post_type(filepath, mediatype)
+
+        UploadMedia(username=username, media_type=mediatype, filepath=filepath, post_type=post_type, timestamp=timestamp, user_id=user_id)
+
+
+def UploadMedia(filepath, username, media_type='image', post_type='story', timestamp=None, user_id=None):
+    filename = os.path.basename(filepath)
+    file_extension = os.path.splitext(filename)[1].lower()
+
+    file_hash = funcs.calculate_file_hash(filepath)
+
+    # Bail out early on known hashes, before any probing or uploading.
+    if file_hash in existing_files:
+        print('Duplicate file detected. Removing...')
+        os.remove(filepath)
+        return True
+
+    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0
+    post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()
+
+    # Files here have no media id, so the content hash names the remote file.
+    dirtype = funcs.determine_post_type(filepath, media_type)
+    server_path = f'media/{dirtype}/{username}/{file_hash}{file_extension}'
+    file_url = f"https://storysave.b-cdn.net/{server_path}"
+
+    obj_storage.PutFile(filepath, server_path)
+
+    if media_type == 'image':
+        with Image.open(filepath) as img:
+            width, height = img.size
+    else:
+        width, height = funcs.get_video_dimensions(filepath)
+
+    # platform is set explicitly so the dedup query in __main__ matches these rows.
+    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, user_id, platform, hash, filename, duration) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
+    values = (username, media_type, file_url, width, height, post_type, post_date, user_id, 'instagram', file_hash, filename, duration)
+
+    newCursor.execute(query, values)
+    newDB.commit()
+    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
+
+    os.remove(filepath)
+
+    return True
+
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    newDB, newCursor = config.gen_connection()
+
+    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+    newCursor.execute("SELECT hash FROM media WHERE platform='instagram' AND hash IS NOT NULL")
+    existing_files = {image[0] for image in newCursor.fetchall()}
+
+    dump_instagram('storysaver/missing/')
+
+    print("Processing completed.")
\ No newline at end of file
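dump_instagram.py and dump_missing_data.py build server_path with f-strings, while dump_facebook.py and dump_tiktok.py originally used os.path.join, which joins with backslashes on Windows and corrupts the resulting CDN URL; the rewrites here use f-strings throughout. posixpath is an equivalent, slightly more explicit option, sketched below with a hypothetical helper:

import posixpath

def build_server_path(dirtype, username, media_id, extension):
    # posixpath always joins with forward slashes, regardless of host OS.
    return posixpath.join('media', dirtype, username, f'{media_id}{extension}')

# build_server_path('stories', 'jane_doe', 'abc123', '.jpg')
# -> 'media/stories/jane_doe/abc123.jpg'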
diff --git a/dump_tiktok.py b/dump_tiktok.py
new file mode 100644
index 0000000..35b0fa1
--- /dev/null
+++ b/dump_tiktok.py
@@ -0,0 +1,67 @@
+from BunnyCDN.Storage import Storage
+import os, config, funcs
+from datetime import datetime
+from PIL import Image
+
+def dump_tiktok(folder_path):
+    # Each subfolder is a username; every file inside belongs to that user.
+    for folder in os.listdir(folder_path):
+        if os.path.isdir(os.path.join(folder_path, folder)):
+            username = folder
+
+            for filename in os.listdir(os.path.join(folder_path, folder)):
+                filepath = os.path.join(folder_path, folder, filename)
+
+                upload_file(username=username, filepath=filepath)
+
+def upload_file(filepath, username):
+    filename = os.path.basename(filepath)
+    media_id = filename.split('.')[0]
+
+    file_extension = os.path.splitext(filename)[1].lower()
+    media_type = funcs.get_media_type(filename)
+    file_hash = funcs.calculate_file_hash(filepath)
+
+    # Check for duplicates before uploading; checking afterwards, as the original
+    # version did, wastes bandwidth and leaves orphaned copies on the CDN.
+    if file_hash in existing_files:
+        print('Duplicate file detected. Removing...')
+        os.remove(filepath)
+        return False
+
+    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0
+
+    if media_type == 'video':
+        width, height = funcs.get_video_dimensions(filepath)
+    else:
+        # The context manager closes the handle before os.remove() below.
+        with Image.open(filepath) as img:
+            width, height = img.size
+
+    # Forward slashes keep the CDN URL valid regardless of host OS.
+    dirtype = funcs.determine_post_type(filepath, media_type)
+    server_path = f'media/{dirtype}/{username}/{media_id}{file_extension}'
+
+    obj_storage.PutFile(filepath, server_path)
+
+    file_url = f"https://storysave.b-cdn.net/{server_path}"
+
+    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, platform, hash, filename, duration) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
+    values = (username, media_type, file_url, width, height, 'tiktok', file_hash, filename, duration)
+
+    newCursor.execute(query, values)
+    newDB.commit()
+    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
+
+    if newCursor.rowcount > 0:
+        os.remove(filepath)
+
+    return True
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    newDB, newCursor = config.gen_connection()
+
+    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+    newCursor.execute("SELECT hash FROM media WHERE platform='tiktok' AND hash IS NOT NULL")
+    existing_files = {image[0] for image in newCursor.fetchall()}
+
+    dump_tiktok('tiktok/')
+
+    print("Processing completed.")
\ No newline at end of file
diff --git a/dupes_by_hash copy.py b/dupes_by_hash copy.py
new file mode 100644
index 0000000..d04ce47
--- /dev/null
+++ b/dupes_by_hash copy.py
@@ -0,0 +1,13 @@
+import os, funcs
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    # Leftover stub from an earlier pass: hash every file in the folder and print it.
+    # The original called os.listdir('') here, which raises FileNotFoundError.
+    folder = 'storysaver/missing_data/'
+
+    for file in os.listdir(folder):
+        filePath = os.path.join(folder, file)
+        file_hash = funcs.calculate_file_hash(filePath)
+        print(f'{file_hash}  {filePath}')
\ No newline at end of file
diff --git a/dupes_by_hash.py b/dupes_by_hash.py
new file mode 100644
index 0000000..16187fe
--- /dev/null
+++ b/dupes_by_hash.py
@@ -0,0 +1,22 @@
+import os, config, funcs
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    newDB, newCursor = config.gen_connection()
+
+    newCursor.execute("SELECT hash FROM media WHERE platform='instagram' AND hash IS NOT NULL")
+    existing_files = {image[0] for image in newCursor.fetchall()}
+
+    # The destination must exist before os.rename() can move files into it.
+    os.makedirs('storysaver/dupes/', exist_ok=True)
+
+    files = os.listdir('storysaver/missing_data/')
+
+    for file in files:
+        filePath = os.path.join('storysaver/missing_data/', file)
+        file_hash = funcs.calculate_file_hash(filePath)
+
+        if file_hash in existing_files:
+            # Move rather than delete, so flagged files can still be reviewed.
+            print(f'Duplicate file detected. Moving {filePath}...')
+            os.rename(filePath, f'storysaver/dupes/{file}')
\ No newline at end of file
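Both dupes_by_hash scripts find duplicates by re-hashing local files against one platform's stored hashes. If the goal is ever to audit duplicates already inside the database, the same check can be pushed into SQL; a sketch against the media table used throughout this patch:

import config

# Find content hashes stored more than once, without touching local files.
db, cursor = config.gen_connection()
cursor.execute(
    "SELECT hash, COUNT(*) AS copies FROM media "
    "WHERE hash IS NOT NULL GROUP BY hash HAVING COUNT(*) > 1"
)
for hash_value, copies in cursor.fetchall():
    print(f"{hash_value} appears {copies} times")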