From 322e39b51f1ea25da4b9f6d370457be5a1cc91ec Mon Sep 17 00:00:00 2001 From: oscar Date: Sun, 1 Dec 2024 17:02:09 +0200 Subject: [PATCH] update --- .DS_Store | Bin 20484 -> 18436 bytes .gitignore | 1 + .../dedupe_phash.py | 0 .../dupecleaner_phash.py | 0 .../find_by_phash.py | 0 .../find_duplicates_by_phash.py | 0 .../find_duplicates_by_phash_videos.py | 0 .../image_dupe_cleaner.py | 0 dedupe_scripts/snappy_duplicates.py | 109 ++++++++++++++++++ funcs.py | 3 - .../organize_tiktoks.py | 0 snappy.py => old/snappy.py | 26 ++++- update_snap_id.py => old/update_snap_id.py | 0 snapchat.py | 2 +- snappy_master.py | 86 +++----------- storysave_dump_tiktok.py | 2 +- storysave_dump_tiktok_process.py | 4 - storysave_dump_unknown.py | 10 +- 18 files changed, 163 insertions(+), 80 deletions(-) rename dedupe_phash.py => dedupe_scripts/dedupe_phash.py (100%) rename dupecleaner_phash.py => dedupe_scripts/dupecleaner_phash.py (100%) rename find_by_phash.py => dedupe_scripts/find_by_phash.py (100%) rename find_duplicates_by_phash.py => dedupe_scripts/find_duplicates_by_phash.py (100%) rename find_duplicates_by_phash_videos.py => dedupe_scripts/find_duplicates_by_phash_videos.py (100%) rename image_dupe_cleaner.py => dedupe_scripts/image_dupe_cleaner.py (100%) create mode 100644 dedupe_scripts/snappy_duplicates.py rename organize_tiktoks.py => old/organize_tiktoks.py (100%) rename snappy.py => old/snappy.py (87%) rename update_snap_id.py => old/update_snap_id.py (100%) diff --git a/.DS_Store b/.DS_Store index de03c36febe233a79b284312f8a5bdffebfedd1f..dd8492d8676faccc4e9406f245ecfd15b240093d 100644 GIT binary patch delta 1908 zcmbVNUrbYH6hF83a^+7M7p2f%aa<_27y2>jJt#{2KS_$+5Kn+bL;-N?^H2cj} zboB{ECT7BpW(z(Nr=>REk!EGu{fWobthl`%HwNt9$|Bk#LIPqi0Z9mhs;#OOgrARA zm`XS@(XY5nt1PY-G>7>G&(&+Ur7ui)+L8Y_G*#1GYszq=e}nebRxigV#=gVZ_EBb# zBBSn|Be7_F>0GOTZ!I#uw3LeCwyDScDO|KTvQt@#@7HfvfVNw;1Jzwz-bl39HTSu} zqW}8obRiu|6yniXcQ;E23!~L{`annOL8sBK7PK26h!RS25*Js0n+ssx>&1{(DyqYwKr7huioOc z@4ru1GXp+c-^D}lELJ+ARlQ#ZT$739L>hBH$!8qGIN>;#b2w93i2y1L6Dh-}_M>d+A%Q8!sl4E33cBfG0PMI#@fO^#PhQv4=-0n3 z@m9cR1`^R&mdp=xf`ox!3UMW12X{`4M`Ginw2yt*Rhc|#bxjJNn;E9);~VeI9=ysm zpt;4mHAEE#cw#yNsy@oQwqGL4QD&^!aF(n%)M7IocHwl3jAuOYI*E7*lOj|&L_F-w+PLL@t*j7E+P_A{R)nt>c7BpeA&2nWBvzvK0~i(41c-^6Xl*}``o*^Pi>QePf9Jk8Z_d0w zGqcE2zKFmMr!Y1CK2C0FpxEJyT$pQefn8BV|Z(7z=V8$-u84ZVWzUXj7%2go@gc$ zO${2|WBZfQej~X()otc-=J4(WYRI+H;(F83SRysJc*BR5=IR%BEw#?VwAeZUI=1A` z>{9KCM{~x;c-)ACq0hRyxNDnzJ48!pJ$;^b(GVS_Z_o*Pm0qVe>34L6-lKmirD{}z znyu!k`D&?JsaB~^Tbwlvx4bR1iu_SMgfl=ea5@jaw)*-fr1@$t^JE1i~E=bfuox?A>))q=p5J&RE? zZ)FjTng)%Ec1^LmP&d2BLk(6vtH){&RC}P>15OVxKD1-!vdbJa|0EPar=M~yC z`m2pzns`~7^g;2QQ&qXf5gG=!pM;+S-JTN?=9iQZfgJdylt7()q^Z5G`m~pk zmV=}r-Qpp%GY!Ajgs*6TE9)npUPCC?Bx<{$jQkP!n?u-58pYA4#`FbO`&)b^W>Kps zB(V3I@sp$Ob;+6OBPX+YM^DE{HkW~>>hNS!%OmooE!MR9D&(KET7K;s_@FL^dvgv>An!!uegXaBu4`vI7&gj8_GzXes)zHkj5fL=`k&DS`ZN8F z{-tK9u)0SrQ>)cy)CRRheNpXDF*U4GYSa?o#PuJye&>(s2+n-9E@ArVe{4hA)}Kzo zb?)6W9jCvf;o4W`uvviZaA*z6|( z`k>kdA)!vE4gmKq067J^PYM8W=mcoo&PfJ_R8E?pJ+eeuFGN@;3B1w8a6S^$5C=h7(>0whL#oF4?PG6a0!b#n}7dqS~jw6e0=4) zd^$2PX~*eCY=KET;1UJ2PHIdY8+dIdqw4CSwxl)_usikh!6Md*eP<>sR zX3FYT?SX0!{2%oI4@q@!p9T?c<|Keo^SY18dlIs6J+}Usb|f8M2~vLdB`ndR4?#k6 z^3TvKOk`qb4it%E@7LLJLf%lg_m+(!L%%D|Cx5D6N<(V!zZ<6KPE6|+Zm285^# zmrF+%&|0aZGTg#}xV>HSGVPq4qdvcF{zn_m-+-^}*Bh#PBlMlA=rB9%Iw-Y*b9U4> zQxD$W937|U>HEl+{gN&rJpYUS+&)t^t5)?1b(iW;_p3E(1Dk6ZN4<;&rB&CRXhD!! z&N8HRUokJ;y}`2XGS88?TRE4s_PQ5e}oB2P$xuZUAU72we4KU~ASurnKxj zgby4%MDG|Ic3LELJ@3fAV`#Z^Pipn#r9aibkq_1E-jMq9yDv30?+h!dAls#rM)WY} zpgCaUktMBX@MLlH)6WkeN$RJ?;fx{7GoULgr&~r2eHn6I6r*fdzY{4)i(oV#D_0NB z)EDtD68u(Ml6c2B%dGu^%A_hDS7fd3nY7dhe1(MWQ4`~xPY-fVzvvmQ;u$ zWhH4_k+ls$5@+aR{!gx9-AVdZJZBis8`OF)yaC0U^l1ZYPt6h9Yt)j`+Mmjy=G^a= zr{c%WUs)vg40s9#biM*d)ZR7L^EVJlJxzNI`kRT1650!*6zDO4V6a_cWpF%g=O8| z@ua6^QgbzL4z{76JTy{Y55K+cKlfQ_Xdk))!j+%T0+{2AF2h+O}Z9qK`H$;h8aTLuVOyGZxO$fy!nX1f?zt9~;ZgD_Br8uVAZs zOhr_min8U2^aUq!#iky|50`!=T+n0B@9|Pdu(*8Sy$h=?uyDzdfF4XGfr9q-aot>U zTre@W*lYn499|XyMSv++zPF1Q~{SX_7@&``O2$Dv?o^)XTevP9H#_@^LOh=se zUX^Oi9!=nS(3gvfcyAf72%N!C30PFr)a!zag#pX|q-EW{h~a>q7dix7*oHqDxNznd zO?EC}9xf6V*f?{G{Kou>v@QXIE}mjJ74b>PIB@FFsjpf7L4~{p(3mW*gblhh!QD{N zGhM*J^9!x%)d@gG3XowZ=n+^vj(fP~3IjOtEfeUTwU+@N3D@wJ*?8rur{dRR_)QA~ zoPDpQ+yUk<(abS?`0bod!B%EMq4j;TPm{u_DE1S2X&)V;3H_DwG@YZ1SQPwsx~7){ zv!AdWUntkAF0~Pph#ytED{}Td8ZI*O+6Or4?4gkcJ^I#}UVDYHb&{ z-XqIplssM7`ZtOEI@thY)axr`Oy1zb>G?*feuF%MY*hT~NW_ zxFl2r^4&7t#s9QiJFs~ISbkm2BIv&x@PWtt9}qae;y^@Y03-f<0Fcy&caOJnOBB4R zgt#))fW;i>R+zXSsBVcewR4|*qZF+~oA3SN{8!Ffx#K|DGSN^8y+ohdRvpj<%66ep zy@Lus5Wg?MH7~hvWKIzWyApxx%xW+LuLy&RR>8S&gum8)AhMa3a)@}+as{!HFu(KQ zm|?MR;h69t3|g-L|LNlE)vekC)gHLBdqCA~>fO|hHs;r*xSPvs?c1^UVdstMCahAk z;9xJu*Y`rW#WX#3*~cYsxHe&xnhn1#wwv}R0IB$O{FM-o&{MZou{!>1UwX7U{_BNP Lu2t|;H~#+@|08FZ diff --git a/.gitignore b/.gitignore index fd9dd52..5a4bdc2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # python files *.pyc __pycache__ +*.DS_Store # Content storysaver diff --git a/dedupe_phash.py b/dedupe_scripts/dedupe_phash.py similarity index 100% rename from dedupe_phash.py rename to dedupe_scripts/dedupe_phash.py diff --git a/dupecleaner_phash.py b/dedupe_scripts/dupecleaner_phash.py similarity index 100% rename from dupecleaner_phash.py rename to dedupe_scripts/dupecleaner_phash.py diff --git a/find_by_phash.py b/dedupe_scripts/find_by_phash.py similarity index 100% rename from find_by_phash.py rename to dedupe_scripts/find_by_phash.py diff --git a/find_duplicates_by_phash.py b/dedupe_scripts/find_duplicates_by_phash.py similarity index 100% rename from find_duplicates_by_phash.py rename to dedupe_scripts/find_duplicates_by_phash.py diff --git a/find_duplicates_by_phash_videos.py b/dedupe_scripts/find_duplicates_by_phash_videos.py similarity index 100% rename from find_duplicates_by_phash_videos.py rename to dedupe_scripts/find_duplicates_by_phash_videos.py diff --git a/image_dupe_cleaner.py b/dedupe_scripts/image_dupe_cleaner.py similarity index 100% rename from image_dupe_cleaner.py rename to dedupe_scripts/image_dupe_cleaner.py diff --git a/dedupe_scripts/snappy_duplicates.py b/dedupe_scripts/snappy_duplicates.py new file mode 100644 index 0000000..0b7138c --- /dev/null +++ b/dedupe_scripts/snappy_duplicates.py @@ -0,0 +1,109 @@ +import os, config, funcs, cv2, imagehash +from PIL import Image + +directory = "old_snapchats" +duplicate_dir = 'dupelicate_snaps' + + +def generate_video_phash(filepath): + try: + cap = cv2.VideoCapture(filepath) + ret, frame = cap.read() + cap.release() + if not ret: + return None + phash = imagehash.phash(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))) + return str(phash) + except: + return None + +def get_snapchat_files(): + stories = funcs.get_files(directory) + stories = [get_media_data(filepath) for filepath in stories] + stories = [story for story in stories if story] + return stories + +def get_media_data(filepath): + filename = os.path.basename(filepath) + parts = filename.split('~') + if len(parts) < 3: + return False + + username = parts[0] + timestamp = parts[1] + snap_id = parts[2] + snap_id = os.path.splitext(snap_id)[0] + + # data = {'username': username, 'timestamp': timestamp, 'filepath': filepath, 'snap_id': snap_id, 'original_snap_id': None} + data = {'username': username, 'timestamp': timestamp, 'filepath': filepath, 'snap_id': None, 'original_snap_id': snap_id} + + return data + +def process_snap_ids(filenames): + snap_ids = [] + for filename in filenames: + snap_id = filename.split('~')[2] + snap_id = os.path.splitext(snap_id)[0] + if snap_id not in snap_ids: + snap_ids.append(snap_id) + + return snap_ids + +def find_duplicate_snap(existing_snaps, current_snap): + filepath = current_snap['filepath'] + original_snap_id = current_snap['original_snap_id'] + username = current_snap['username'] + + snap_hash = funcs.calculate_file_hash(current_snap['filepath']) + if filepath.endswith('.mp4'): + phash = generate_video_phash(current_snap['filepath']) + elif filepath.endswith('.jpg'): + phash = funcs.generate_phash(current_snap['filepath']) + + for snap in existing_snaps: + if username != snap[2]: + continue + + if original_snap_id in snap[1]: + return snap + if original_snap_id == snap[5]: + return snap + if snap_hash == snap[3]: + return snap + if phash == snap[4]: + return snap + + return False + +if __name__ == '__main__': + print('Starting snappy...') + + db, cursor = config.gen_connection() + obj_storage = config.get_storage() + + stories_from_files = get_snapchat_files() + + # this script will check if there are any duplicates in old_snapchats folder in the database in table media where platform = 'snapchat' + cursor.execute("SELECT id, filename, username, hash, phash, original_snap_id FROM media WHERE filename IS NOT NULL AND platform = 'snapchat'") + existing_medias = cursor.fetchall() + + snap_files = get_snapchat_files() + + os.makedirs(duplicate_dir, exist_ok=True) + + for story in snap_files: + print(f"Processing {story['username']}...") + snap_id = story['snap_id'] + original_snap_id = story['original_snap_id'] + username = story['username'] + + # check if the snap_id is already in the database + existing_snap = find_duplicate_snap(existing_medias, story) + + if existing_snap: + print(f"Snap {original_snap_id} already exists in the database.") + new_filename = os.path.basename(story['filepath']) + new_filepath = os.path.join(duplicate_dir, new_filename) + os.rename(story['filepath'], new_filepath) + + print("Processing completed.") \ No newline at end of file diff --git a/funcs.py b/funcs.py index 728a6fc..fb63241 100644 --- a/funcs.py +++ b/funcs.py @@ -29,9 +29,6 @@ def get_files(directory): files.append(os.path.join(root, filename)) return files -import cv2 -import numpy as np - def compare_images(image_path1, image_path2): # Load the images in grayscale img1 = cv2.imread(image_path1, cv2.IMREAD_GRAYSCALE) diff --git a/organize_tiktoks.py b/old/organize_tiktoks.py similarity index 100% rename from organize_tiktoks.py rename to old/organize_tiktoks.py diff --git a/snappy.py b/old/snappy.py similarity index 87% rename from snappy.py rename to old/snappy.py index 1a2afd6..d70186c 100644 --- a/snappy.py +++ b/old/snappy.py @@ -52,6 +52,29 @@ def get_file_extension(url): else: print(f"Unknown content type for media {url}") return None + +def extract_file_type(url): + file_types = { + '400': '.jpg', + '1322': '.mp4', + '1325': '.mp4', + '1034': '.mp4', + '1023': '.jpg' + } + + base_url = url.split("?")[0] # Remove query string + + snap_data = base_url.split('/')[-1] + + # Extract the file type number + data_parts = snap_data.split('.') + if len(data_parts) > 1: + file_type_number = data_parts[1] + if file_type_number in file_types: + return file_types[file_type_number] + else: + print(f"Unexpected URL format: {base_url}") + return None def download_media(url, filepath): if os.path.exists(filepath): @@ -112,9 +135,10 @@ def main(): # Determine file extension using HEAD request. # TODO: find a better way to determine file extension without downloading the file. - extension = get_file_extension(url) + extension = extract_file_type(url) if not extension: continue + filename = f"{username}~{timestamp}~{snap_id}{extension}" filepath = os.path.join(directory, filename) diff --git a/update_snap_id.py b/old/update_snap_id.py similarity index 100% rename from update_snap_id.py rename to old/update_snap_id.py diff --git a/snapchat.py b/snapchat.py index da2981d..c1167e0 100644 --- a/snapchat.py +++ b/snapchat.py @@ -92,4 +92,4 @@ def get_highlight_stories(data): story = parse_story(snap) stories.append(story) - return stories + return stories \ No newline at end of file diff --git a/snappy_master.py b/snappy_master.py index 81cea1e..0977274 100644 --- a/snappy_master.py +++ b/snappy_master.py @@ -21,7 +21,6 @@ def archive_data(data, username): data_filepath = os.path.join(data_directory, data_filename) with open(data_filepath, 'w') as f: f.write(json.dumps(data)) - print(f"Archived data for {username} at {data_filepath}") def get_file_extension(url): response = requests.head(url) @@ -64,7 +63,7 @@ def extract_file_type(url): def download_media(url, filepath): if os.path.exists(filepath): - print(f"File {filepath} already exists. Skipping download.") + # File already exists, skip download and return the filepath as if it was downloaded. return filepath response = requests.get(url) @@ -76,55 +75,6 @@ def download_media(url, filepath): f.write(response.content) return filepath -def get_all_stories(usernames): - snapchat_users_data = get_all_users_data(usernames) - - all_stories = [] - for username in usernames: - print(f"Getting stories for {username}...") - data = snapchat_users_data.get(username) - if not data: - print(f"Failed to get data for {username}. Skipping.") - continue - - archive_data(data, username) - - print("Getting stories...") - stories = get_stories(data) - - print("Getting highlights...") - stories.extend(get_highlight_stories(data)) - - for story in stories: - snap_id = story['snap_id'] - url = story['url'] - timestamp = story['timestamp'] - - # Determine file extension using HEAD request. - extension = extract_file_type(url) - if not extension: - print(f"Failed to determine file extension for {url}. Skipping.") - continue - - filename = f"{username}~{timestamp}~{snap_id}{extension}" - filepath = os.path.join(directory, filename) - - media = { - 'username': username, - 'timestamp': timestamp, - 'filepath': filepath, - 'snap_id': snap_id, - 'original_snap_id': story['original_snap_id'], - 'media_url': url, - } - - all_stories.append(media) - print(f"Media {snap_id} ready for download.") - - all_stories.extend(stories) - - return all_stories - def get_snapchat_stories(): os.makedirs(directory, exist_ok=True) os.makedirs(data_directory, exist_ok=True) @@ -149,10 +99,8 @@ def get_snapchat_stories(): archive_data(data, username) - print("Getting stories...") stories = get_stories(data) - print("Getting highlights...") stories.extend(get_highlight_stories(data)) for story in stories: @@ -162,7 +110,7 @@ def get_snapchat_stories(): duplicate_snap = find_duplicate_snap(existing_medias, snap_id, username) if duplicate_snap: - print(f"Media {snap_id} already exists. Skipping download.") + # Snap already exists in the database continue # Determine file extension using HEAD request. @@ -191,11 +139,25 @@ def get_snapchat_stories(): return ready_stories +def get_snapchat_files(): + stories = funcs.get_files(directory) + stories = [get_media_data(filepath) for filepath in stories] + stories = [story for story in stories if story] + return stories + +def main(): + ready_stories = get_snapchat_stories() + stories_from_files = get_snapchat_files() + + ready_stories.extend(stories_from_files) + + download_stories(ready_stories) + def download_stories(stories): for story in stories: # Download the media filepath = story['filepath'] - url = story['media_url'] if 'media_url' in story else None + url = story['media_url'] filename = os.path.basename(filepath) timestamp = story['timestamp'] @@ -209,17 +171,6 @@ def download_stories(stories): UploadMedia(story) -def main(): - ready_stories = get_snapchat_stories() - - stories_from_files = funcs.get_files(directory) - stories_from_files = [get_media_data(filepath) for filepath in stories_from_files] - stories_from_files = [story for story in stories_from_files if story] - - ready_stories.extend(stories_from_files) - - download_stories(ready_stories) - def UploadMedia(media): username = media['username'] timestamp = media['timestamp'] @@ -288,7 +239,8 @@ def get_media_data(filepath): snap_id = parts[2] snap_id = os.path.splitext(snap_id)[0] - data = {'username': username, 'timestamp': timestamp, 'filepath': filepath, 'snap_id': snap_id, 'original_snap_id': None} + data = {'username': username, 'timestamp': timestamp, 'filepath': filepath, 'snap_id': snap_id, 'original_snap_id': None, 'media_url': None} + # data = {'username': username, 'timestamp': timestamp, 'filepath': filepath, 'snap_id': None, 'original_snap_id': snap_id, 'media_url': None} return data diff --git a/storysave_dump_tiktok.py b/storysave_dump_tiktok.py index 5310642..f4ddf10 100644 --- a/storysave_dump_tiktok.py +++ b/storysave_dump_tiktok.py @@ -120,7 +120,7 @@ def dump_instagram(folder_path): if __name__ == '__main__': print('Starting processing...') - + if not os.listdir(directory): print('No files to process. Exiting...') exit() diff --git a/storysave_dump_tiktok_process.py b/storysave_dump_tiktok_process.py index 6d08495..59452b1 100644 --- a/storysave_dump_tiktok_process.py +++ b/storysave_dump_tiktok_process.py @@ -10,10 +10,6 @@ def is_valid_uuid(uuid_to_test, version=4): return str(uuid_obj) == uuid_to_test -# file name : masstik_caammmyyy_1310_655_going blonde wednesdayyyy.mp4 -# file name : masstiktok_aleksandraverse__#fyp #trending #viral #foryou.mp4 -# where the first item is prefix, second is username and after those is the tiktok title - source_dir = 'tiktoks/' processed_dir = 'processed_tiktoks' diff --git a/storysave_dump_unknown.py b/storysave_dump_unknown.py index 9eaa8eb..5d0031e 100644 --- a/storysave_dump_unknown.py +++ b/storysave_dump_unknown.py @@ -2,7 +2,7 @@ from datetime import datetime import os, config, funcs, cv2 from uuid import uuid4 -directory = 'ready_to_upload' +directory = 'ready_for_upload/instagram' def UploadMedia(username, user_id, filepath): thumbnail_url = None @@ -80,8 +80,12 @@ def get_user_id(username): def get_media(folder_path): medias = [] - for user_folder in os.listdir(folder_path): - files = os.listdir(os.path.join(folder_path, user_folder)) + user_folders = os.listdir(folder_path) + for user_folder in user_folders: + user_folder_path = os.path.join(folder_path, user_folder) + if not os.path.isdir(user_folder_path): + continue + files = os.listdir(user_folder_path) for filename in files: filepath = os.path.join(folder_path, user_folder, filename) media = {