From d4fb23f79860e45ad537ee3c076b04f957214de9 Mon Sep 17 00:00:00 2001
From: oscar <>
Date: Fri, 29 Nov 2024 14:16:44 -0800
Subject: [PATCH] Initial commit

---
 .gitattributes          |   2 +
 .gitignore              | 160 ++++++++++++++++++++++++++++++++++++++++
 CLEAN_FROM_OLD_DUPES.py |  24 ++++++
 config.py               |  27 +++++++
 dump_facebook.py        | 105 ++++++++++++++++++++++++++
 dump_instagram.py       |  77 +++++++++++++++++++
 dump_missing_data.py    |  82 ++++++++++++++++++++
 dump_tiktok.py          |  67 +++++++++++++++++
 dupes_by_hash copy.py   |  13 ++++
 dupes_by_hash.py        |  22 ++++++
 10 files changed, 579 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 .gitignore
 create mode 100644 CLEAN_FROM_OLD_DUPES.py
 create mode 100644 config.py
 create mode 100644 dump_facebook.py
 create mode 100644 dump_instagram.py
 create mode 100644 dump_missing_data.py
 create mode 100644 dump_tiktok.py
 create mode 100644 dupes_by_hash copy.py
 create mode 100644 dupes_by_hash.py

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..dfe0770
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+# Auto detect text files and perform LF normalization
+* text=auto
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..68bc17f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/CLEAN_FROM_OLD_DUPES.py b/CLEAN_FROM_OLD_DUPES.py
new file mode 100644
index 0000000..0bb75ef
--- /dev/null
+++ b/CLEAN_FROM_OLD_DUPES.py
@@ -0,0 +1,24 @@
+import config
+
+# Two connections: altpins is the table being cleaned, storysave is the reference.
+altpins_db, altpins_cursor = config.altpins_gen_connection()
+db, cursor = config.gen_connection()
+
+# Map (title/username, hash) -> (row id, url) for each table.
+altpins_cursor.execute("SELECT id, title, hash, url FROM pins WHERE hash IS NOT NULL;")
+altpins_results = { (row[1], row[2]): (row[0], row[3]) for row in altpins_cursor.fetchall() }
+
+cursor.execute("SELECT id, username, hash, media_url FROM media WHERE hash IS NOT NULL;")
+media_results = { (row[1], row[2]): (row[0], row[3]) for row in cursor.fetchall() }
+
+# Any key present in both tables is a duplicate pin; delete it from altpins.
+common_items = set(altpins_results.keys()) & set(media_results.keys())
+
+for title, hash_value in common_items:
+    altpins_id, altpins_url = altpins_results[(title, hash_value)]
+    media_id, media_url = media_results[(title, hash_value)]
+
+    print(f"Found a match for hash {hash_value} with title {title}")
+    print(f"Altpins URL: {altpins_url}")
+    print(f"Media URL: {media_url}")
+
+    altpins_cursor.execute("DELETE FROM pins WHERE id = %s;", [altpins_id])
+    altpins_db.commit()
+    print(f"Deleted pin {altpins_id}. {altpins_cursor.rowcount} rows affected")
\ No newline at end of file
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..4c51f7c
--- /dev/null
+++ b/config.py
@@ -0,0 +1,27 @@
+import mysql.connector
+
+# WARNING: live credentials are hardcoded and committed here; prefer loading
+# them from environment variables (see the note below this file).
+altpins_username = "xantorn"
+altpins_password = "AVNS_lGiLOVTTyGMtoOoRn5Q"
+altpins_host = "archivebate-db-do-user-13308724-0.b.db.ondigitalocean.com"
+altpins_port = 25060
+altpins_database = "altpins"
+altpins_sslmode = "REQUIRED"  # informational only; not passed to mysql.connector below
+
+def altpins_gen_connection():
+    print("Connecting to altpins database")
+    newDB = mysql.connector.connect(host=altpins_host, user=altpins_username, password=altpins_password, database=altpins_database, port=altpins_port)
+    print("Connected to altpins database")
+    return newDB, newDB.cursor()
+
+username = "doadmin"
+password = "AVNS_KNXK1IjScgTCe09gI9F"
+host = "storysave-do-user-13308724-0.c.db.ondigitalocean.com"
+port = 25060
+database = "storysave"
+sslmode = "REQUIRED"  # informational only; not passed to mysql.connector below
+
+def gen_connection():
+    print("Connecting to storysave database")
+    newDB = mysql.connector.connect(host=host, user=username, password=password, database=database, port=port)
+    print("Connected to storysave database")
+    return newDB, newDB.cursor()
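A note on config.py: committing passwords to version control means anyone with repository access can use them, and rotating them requires a new commit. Below is a minimal sketch of the same module reading its settings from the environment instead; the STORYSAVE_DB_* variable names are hypothetical and defined nowhere in this patch:

import os
import mysql.connector

# Hypothetical variable names; export them in the shell or load them with a dotenv tool.
host = os.environ["STORYSAVE_DB_HOST"]
username = os.environ["STORYSAVE_DB_USER"]
password = os.environ["STORYSAVE_DB_PASSWORD"]
database = os.environ.get("STORYSAVE_DB_NAME", "storysave")
port = int(os.environ.get("STORYSAVE_DB_PORT", "25060"))

def gen_connection():
    newDB = mysql.connector.connect(host=host, user=username, password=password, database=database, port=port)
    return newDB, newDB.cursor()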
diff --git a/dump_facebook.py b/dump_facebook.py
new file mode 100644
index 0000000..ae59428
--- /dev/null
+++ b/dump_facebook.py
@@ -0,0 +1,105 @@
+from BunnyCDN.Storage import Storage
+import os, uuid, config, funcs
+from datetime import datetime
+from PIL import Image
+
+def dump_facebook(folder_path):
+    # Loose files at the top level: the username is everything before the first apostrophe.
+    for filename in os.listdir(folder_path):
+        if os.path.isdir(os.path.join(folder_path, filename)):
+            continue
+
+        username = filename.split("'")[0]
+        filepath = os.path.join(folder_path, filename)
+
+        mediatype = funcs.get_media_type(filename)
+        post_type = funcs.determine_post_type(filepath, mediatype)
+
+        upload_file(username=username, media_type=mediatype, filepath=filepath, post_type=post_type)
+
+    # Subfolders: the folder name is the username.
+    for folder in os.listdir(folder_path):
+        if os.path.isdir(os.path.join(folder_path, folder)):
+            username = folder
+
+            for filename in os.listdir(os.path.join(folder_path, folder)):
+                filepath = os.path.join(folder_path, folder, filename)
+
+                mediatype = funcs.get_media_type(filename)
+                post_type = funcs.determine_post_type(filepath, mediatype)
+
+                upload_file(username=username, media_type=mediatype, filepath=filepath, post_type=post_type)
+
+def upload_file(filepath, username, media_type='image', post_type='story', timestamp=None, user_id=None):
+    filename = os.path.basename(filepath)
+    file_extension = os.path.splitext(filename)[1].lower()
+
+    file_hash = funcs.calculate_file_hash(filepath)
+
+    if file_hash in existing_files:
+        print('Duplicate file detected. Removing...')
+        os.remove(filepath)
+        return False
+
+    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0
+
+    # FB_IMG_<id>.<ext> files carry a usable media id; otherwise generate one.
+    if "FB_IMG" in filename:
+        media_id = filename.split("_")[2].split(".")[0]
+    else:
+        media_id = uuid.uuid4().hex
+
+    # Build the remote path with forward slashes; os.path.join would use
+    # backslashes on Windows and corrupt the CDN URL.
+    dirtype = funcs.determine_post_type(filepath, media_type)
+    server_path = f'media/{dirtype}/{username}/{media_id}{file_extension}'
+
+    try:
+        obj_storage.PutFile(filepath, server_path)
+    except Exception as e:
+        print(f"Failed to upload {filepath} to storage: {e}")
+        return False
+
+    file_url = f"https://storysave.b-cdn.net/{server_path}"
+
+    if media_type == 'image':
+        with Image.open(filepath) as img:
+            width, height = img.size
+    else:
+        width, height = funcs.get_video_dimensions(filepath)
+
+    post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()
+
+    # Normalize the folder name to the value stored in the database.
+    post_type = 'story' if post_type == 'stories' else 'post'
+
+    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, user_id, platform, hash, filename, duration) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
+    values = (username, media_type, file_url, width, height, post_type, post_date, user_id, 'facebook', file_hash, filename, duration)
+
+    try:
+        newCursor.execute(query, values)
+        newDB.commit()
+        print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
+    except Exception as e:
+        print(f"Database error: {e}")
+        return False
+
+    # Remember the hash so the same content is not re-uploaded later in this run.
+    existing_files.add(file_hash)
+
+    try:
+        if newCursor.rowcount > 0:
+            os.remove(filepath)
+    except Exception as e:
+        print(f"Failed to remove local file {filepath}: {e}")
+
+    return True
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    newDB, newCursor = config.gen_connection()
+
+    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+    # A set gives O(1) duplicate checks; a list would rescan on every file.
+    newCursor.execute("SELECT hash FROM media WHERE platform='facebook' AND hash IS NOT NULL")
+    existing_files = {image[0] for image in newCursor.fetchall()}
+
+    dump_facebook('facebook/')
+
+    print("Processing completed.")
\ No newline at end of file
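Every script in this commit imports funcs, but funcs.py itself is not included. The sketch below reconstructs what its five helpers appear to do, inferred purely from the call sites; the MD5 digest, the ffprobe-based probing, and the portrait-means-story rule in determine_post_type are assumptions, not the author's actual implementation:

# funcs.py -- hypothetical reconstruction; this file is not part of the commit.
import hashlib, json, os, subprocess
from PIL import Image

VIDEO_EXTS = {'.mp4', '.mov', '.webm', '.mkv'}

def get_media_type(filename):
    # Classify by extension: anything not recognized as video is treated as an image.
    return 'video' if os.path.splitext(filename)[1].lower() in VIDEO_EXTS else 'image'

def calculate_file_hash(filepath, chunk_size=1 << 20):
    # Content digest used for deduplication; read in chunks to bound memory use.
    digest = hashlib.md5()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

def _probe(filepath):
    # Ask ffprobe for the first video stream's geometry and the container duration.
    out = subprocess.check_output([
        'ffprobe', '-v', 'error', '-select_streams', 'v:0',
        '-show_entries', 'stream=width,height:format=duration',
        '-of', 'json', filepath])
    return json.loads(out)

def get_video_dimensions(filepath):
    stream = _probe(filepath)['streams'][0]
    return stream['width'], stream['height']

def get_video_duration(filepath):
    return float(_probe(filepath)['format'].get('duration', 0))

def determine_post_type(filepath, media_type):
    # Bucket media into 'stories' or 'posts'. The real rule is unknown;
    # treating portrait media as stories is only a placeholder heuristic.
    if media_type == 'video':
        width, height = get_video_dimensions(filepath)
    else:
        with Image.open(filepath) as img:
            width, height = img.size
    return 'stories' if height > width else 'posts'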
diff --git a/dump_instagram.py b/dump_instagram.py
new file mode 100644
index 0000000..35bda77
--- /dev/null
+++ b/dump_instagram.py
@@ -0,0 +1,77 @@
+from BunnyCDN.Storage import Storage
+from datetime import datetime
+import os, config, funcs
+from PIL import Image
+
+def dump_instagram(folder_path):
+    # Filenames look like <username>_<timestamp>_<media_id>_<user_id>.<ext>,
+    # where the username itself may contain underscores (see the example below this file).
+    for filename in os.listdir(folder_path):
+        if os.path.isdir(os.path.join(folder_path, filename)):
+            continue
+
+        parts = filename.split('_')
+        if len(parts) < 4:
+            continue
+
+        try:
+            username = '_'.join(parts[:-3])
+            timestamp = int(parts[-3])
+            media_id = int(parts[-2])
+            user_id = int(parts[-1].split('.')[0])
+        except ValueError:
+            # Unparseable names go to a manual-sort folder instead of aborting the run.
+            print(f"Invalid filename: {filename}")
+            os.makedirs(os.path.join(folder_path, 'sort'), exist_ok=True)
+            os.rename(os.path.join(folder_path, filename), os.path.join(folder_path, 'sort', filename))
+            continue
+
+        filepath = os.path.join(folder_path, filename)
+        UploadMedia(username=username, filepath=filepath, media_id=media_id, timestamp=timestamp, user_id=user_id)
+
+
+def UploadMedia(filepath, username, media_id=None, timestamp=None, user_id=None):
+    filename = os.path.basename(filepath)
+    file_extension = os.path.splitext(filename)[1].lower()
+
+    # Bail out early on known media ids, before any hashing, probing, or uploading.
+    if media_id and int(media_id) in existing_files:
+        print('Duplicate file detected. Removing...')
+        os.remove(filepath)
+        return True
+
+    media_type = funcs.get_media_type(filename)
+    post_type = funcs.determine_post_type(filepath, media_type)
+    file_hash = funcs.calculate_file_hash(filepath)
+    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0
+    post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()
+
+    if media_type == 'video':
+        width, height = funcs.get_video_dimensions(filepath)
+    else:
+        # The context manager closes the handle before os.remove() below;
+        # a bare Image.open() can keep the file locked on Windows.
+        with Image.open(filepath) as img:
+            width, height = img.size
+
+    server_path = f'media/{post_type}/{username}/{media_id}{file_extension}'
+    file_url = f"https://storysave.b-cdn.net/{server_path}"
+
+    obj_storage.PutFile(filepath, server_path)
+
+    # platform is set explicitly so the dedup query in __main__ matches these rows.
+    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, platform, hash, filename, duration) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
+    values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, 'instagram', file_hash, filename, duration)
+
+    newCursor.execute(query, values)
+    newDB.commit()
+    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
+
+    os.remove(filepath)
+
+    return True
+
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    newDB, newCursor = config.gen_connection()
+
+    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+    # A set gives O(1) duplicate checks.
+    newCursor.execute("SELECT media_id FROM media WHERE platform='instagram' AND media_id IS NOT NULL")
+    existing_files = {image[0] for image in newCursor.fetchall()}
+
+    dump_instagram('storysaver/')
+
+    print("Processing completed.")
\ No newline at end of file
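The filename parsing at the top of dump_instagram.py is the delicate part: usernames may themselves contain underscores, so the script splits on '_' and rejoins everything except the last three fields. A worked example with a hypothetical filename:

filename = "jane_doe_99_1732900000_3141592653_271828.jpg"  # hypothetical
parts = filename.split('_')

username = '_'.join(parts[:-3])           # "jane_doe_99"
timestamp = int(parts[-3])                # 1732900000
media_id = int(parts[-2])                 # 3141592653
user_id = int(parts[-1].split('.')[0])    # 271828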
diff --git a/dump_missing_data.py b/dump_missing_data.py
new file mode 100644
index 0000000..decb148
--- /dev/null
+++ b/dump_missing_data.py
@@ -0,0 +1,82 @@
+from BunnyCDN.Storage import Storage
+from datetime import datetime
+import os, config, funcs
+from PIL import Image
+
+def dump_instagram(folder_path):
+    # Filenames here carry only two trailing fields: <username>_<timestamp>_<user_id>.<ext>.
+    for filename in os.listdir(folder_path):
+        parts = filename.split('_')
+
+        try:
+            username = '_'.join(parts[:-2])         # everything before the last two fields
+            timestamp = int(parts[-2])              # second-to-last field is the timestamp
+            user_id = int(parts[-1].split('.')[0])  # last field, before the extension, is the user id
+        except ValueError as e:
+            print(f"Invalid filename: {filename}. Error: {e}")
+            continue
+
+        filepath = os.path.join(folder_path, filename)
+
+        mediatype = funcs.get_media_type(filename)
+        post_type = funcs.determine_post_type(filepath, mediatype)
+
+        UploadMedia(username=username, media_type=mediatype, filepath=filepath, post_type=post_type, timestamp=timestamp, user_id=user_id)
+
+
+def UploadMedia(filepath, username, media_type='image', post_type='story', timestamp=None, user_id=None):
+    filename = os.path.basename(filepath)
+    file_extension = os.path.splitext(filename)[1].lower()
+
+    file_hash = funcs.calculate_file_hash(filepath)
+
+    # Bail out early on known hashes, before any probing or uploading.
+    if file_hash in existing_files:
+        print('Duplicate file detected. Removing...')
+        os.remove(filepath)
+        return True
+
+    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0
+    post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()
+
+    # Files here have no media id, so the content hash names the remote file.
+    dirtype = funcs.determine_post_type(filepath, media_type)
+    server_path = f'media/{dirtype}/{username}/{file_hash}{file_extension}'
+    file_url = f"https://storysave.b-cdn.net/{server_path}"
+
+    obj_storage.PutFile(filepath, server_path)
+
+    if media_type == 'image':
+        with Image.open(filepath) as img:
+            width, height = img.size
+    else:
+        width, height = funcs.get_video_dimensions(filepath)
+
+    # platform is set explicitly so the dedup query in __main__ matches these rows.
+    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, user_id, platform, hash, filename, duration) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
+    values = (username, media_type, file_url, width, height, post_type, post_date, user_id, 'instagram', file_hash, filename, duration)
+
+    newCursor.execute(query, values)
+    newDB.commit()
+    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
+
+    os.remove(filepath)
+
+    return True
+
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    newDB, newCursor = config.gen_connection()
+
+    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+    newCursor.execute("SELECT hash FROM media WHERE platform='instagram' AND hash IS NOT NULL")
+    existing_files = {image[0] for image in newCursor.fetchall()}
+
+    dump_instagram('storysaver/missing/')
+
+    print("Processing completed.")
\ No newline at end of file
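dump_instagram.py and dump_missing_data.py build server_path with f-strings, while dump_facebook.py and dump_tiktok.py originally used os.path.join, which joins with backslashes on Windows and corrupts the resulting CDN URL; the rewrites here use f-strings throughout. posixpath is an equivalent, slightly more explicit option, sketched below with a hypothetical helper:

import posixpath

def build_server_path(dirtype, username, media_id, extension):
    # posixpath always joins with forward slashes, regardless of host OS.
    return posixpath.join('media', dirtype, username, f'{media_id}{extension}')

# build_server_path('stories', 'jane_doe', 'abc123', '.jpg')
# -> 'media/stories/jane_doe/abc123.jpg'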
diff --git a/dump_tiktok.py b/dump_tiktok.py
new file mode 100644
index 0000000..35b0fa1
--- /dev/null
+++ b/dump_tiktok.py
@@ -0,0 +1,67 @@
+from BunnyCDN.Storage import Storage
+import os, config, funcs
+from datetime import datetime
+from PIL import Image
+
+def dump_tiktok(folder_path):
+    # Each subfolder is a username; every file inside belongs to that user.
+    for folder in os.listdir(folder_path):
+        if os.path.isdir(os.path.join(folder_path, folder)):
+            username = folder
+
+            for filename in os.listdir(os.path.join(folder_path, folder)):
+                filepath = os.path.join(folder_path, folder, filename)
+
+                upload_file(username=username, filepath=filepath)
+
+def upload_file(filepath, username):
+    filename = os.path.basename(filepath)
+    media_id = filename.split('.')[0]
+
+    file_extension = os.path.splitext(filename)[1].lower()
+    media_type = funcs.get_media_type(filename)
+    file_hash = funcs.calculate_file_hash(filepath)
+
+    # Check for duplicates before uploading; checking afterwards, as the original
+    # version did, wastes bandwidth and leaves orphaned copies on the CDN.
+    if file_hash in existing_files:
+        print('Duplicate file detected. Removing...')
+        os.remove(filepath)
+        return False
+
+    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0
+
+    if media_type == 'video':
+        width, height = funcs.get_video_dimensions(filepath)
+    else:
+        # The context manager closes the handle before os.remove() below.
+        with Image.open(filepath) as img:
+            width, height = img.size
+
+    # Forward slashes keep the CDN URL valid regardless of host OS.
+    dirtype = funcs.determine_post_type(filepath, media_type)
+    server_path = f'media/{dirtype}/{username}/{media_id}{file_extension}'
+
+    obj_storage.PutFile(filepath, server_path)
+
+    file_url = f"https://storysave.b-cdn.net/{server_path}"
+
+    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, platform, hash, filename, duration) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
+    values = (username, media_type, file_url, width, height, 'tiktok', file_hash, filename, duration)
+
+    newCursor.execute(query, values)
+    newDB.commit()
+    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
+
+    if newCursor.rowcount > 0:
+        os.remove(filepath)
+
+    return True
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    newDB, newCursor = config.gen_connection()
+
+    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+    newCursor.execute("SELECT hash FROM media WHERE platform='tiktok' AND hash IS NOT NULL")
+    existing_files = {image[0] for image in newCursor.fetchall()}
+
+    dump_tiktok('tiktok/')
+
+    print("Processing completed.")
\ No newline at end of file
diff --git a/dupes_by_hash copy.py b/dupes_by_hash copy.py
new file mode 100644
index 0000000..d04ce47
--- /dev/null
+++ b/dupes_by_hash copy.py
@@ -0,0 +1,13 @@
+import os, funcs
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    # Leftover stub from an earlier pass: hash every file in the folder and print it.
+    # The original called os.listdir('') here, which raises FileNotFoundError.
+    folder = 'storysaver/missing_data/'
+
+    for file in os.listdir(folder):
+        filePath = os.path.join(folder, file)
+        file_hash = funcs.calculate_file_hash(filePath)
+        print(f'{file_hash}  {filePath}')
\ No newline at end of file
diff --git a/dupes_by_hash.py b/dupes_by_hash.py
new file mode 100644
index 0000000..16187fe
--- /dev/null
+++ b/dupes_by_hash.py
@@ -0,0 +1,22 @@
+import os, config, funcs
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    newDB, newCursor = config.gen_connection()
+
+    newCursor.execute("SELECT hash FROM media WHERE platform='instagram' AND hash IS NOT NULL")
+    existing_files = {image[0] for image in newCursor.fetchall()}
+
+    # The destination must exist before os.rename() can move files into it.
+    os.makedirs('storysaver/dupes/', exist_ok=True)
+
+    files = os.listdir('storysaver/missing_data/')
+
+    for file in files:
+        filePath = os.path.join('storysaver/missing_data/', file)
+        file_hash = funcs.calculate_file_hash(filePath)
+
+        if file_hash in existing_files:
+            # Move rather than delete, so flagged files can still be reviewed.
+            print(f'Duplicate file detected. Moving {filePath}...')
+            os.rename(filePath, f'storysaver/dupes/{file}')
\ No newline at end of file
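Both dupes_by_hash scripts find duplicates by re-hashing local files against one platform's stored hashes. If the goal is ever to audit duplicates already inside the database, the same check can be pushed into SQL; a sketch against the media table used throughout this patch:

import config

# Find content hashes stored more than once, without touching local files.
db, cursor = config.gen_connection()
cursor.execute(
    "SELECT hash, COUNT(*) AS copies FROM media "
    "WHERE hash IS NOT NULL GROUP BY hash HAVING COUNT(*) > 1"
)
for hash_value, copies in cursor.fetchall():
    print(f"{hash_value} appears {copies} times")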