# --- Module setup: media folders, folder→post_type mapping, cache constants ---
# NOTE(review): this file arrived as a corrupted unified diff (hunk headers and
# both old/new lines present, whitespace mangled). This block is the
# reconstructed post-patch code. `temp_directory` is defined earlier in the
# file (not visible here) — confirm it exists before this runs.
directory = 'media'
os.makedirs(temp_directory, exist_ok=True)

# Maps the on-disk folder name to the post_type value stored in the database.
media_types = {
    'stories': 'story',
    'posts': 'post',
    'profile': 'profile',
}
for media_type in media_types:
    os.makedirs(os.path.join(directory, media_type), exist_ok=True)

# media_ids already known to the DB; a set() so membership tests are O(1)
# (the pre-patch code wrongly initialised this as an empty dict).
existing_media_ids = set()
UPLOAD_CUSTOM = False
CACHE_FILE = os.path.join(temp_directory, 'existing_media_ids.json')
CACHE_TTL = timedelta(hours=48)
# NOTE(review): corrupted-diff residue — hunk headers below hide interior lines
# (media_id/timestamp/file_hash extraction, image branch, DB insert execution),
# and some statements appear in both pre- and post-patch form. Kept
# byte-identical; comments only.
# Uploads one media record: dedupe check, thumbnail for videos, push file to
# object storage, insert row(s) into the `media` (and maybe `highlights`) tables.
def UploadMedia ( media ) :
username = media [ ' username ' ]
user_id = media [ ' user_id ' ]
@ -37,12 +37,12 @@ def UploadMedia(media):
post_type = media [ ' post_type ' ]
thumbnail_url = None
phash = None
# Skip (and delete) files whose media_id is already known; returns True so the
# caller treats the duplicate as handled, not failed.
if media_id and media_id in existing_media_ids :
print ( ' Duplicate file detected. Removing... ' )
os . remove ( filepath )
return True
file_size = os . path . getsize ( filepath )
filename = os . path . basename ( filepath )
file_extension = os . path . splitext ( filename ) [ 1 ] . lower ( )
@ -56,14 +56,16 @@ def UploadMedia(media):
print ( f ' Error determining media type for { filename } . Skipping... ' )
return False
# Old one-line try/except (pre-patch) followed by the reformatted post-patch
# version: fall back to the file's ctime when the parsed timestamp is invalid.
# NOTE(review): bare except also swallows KeyboardInterrupt — consider
# narrowing to (TypeError, ValueError, OSError).
try : post_date = datetime . fromtimestamp ( int ( timestamp ) )
except : post_date = datetime . fromtimestamp ( os . path . getctime ( filepath ) )
try :
post_date = datetime . fromtimestamp ( int ( timestamp ) )
except :
post_date = datetime . fromtimestamp ( os . path . getctime ( filepath ) )
width , height = funcs . get_media_dimensions ( filepath )
if 0 in ( width , height ) :
print ( f ' Error getting dimensions for { filename } . Skipping... ' )
return False
duration = funcs . get_video_duration ( filepath )
if media_type == ' image ' :
@ -71,7 +73,7 @@ def UploadMedia(media):
elif media_type == ' video ' :
try :
# Generate a poster frame, upload it keyed by file hash, and phash it.
# Pre-patch line (with its in-line worry about hash collisions) and the
# post-patch line both remain below.
thumb_path = generate_thumbnail ( filepath )
obj_storage . PutFile ( thumb_path , f ' thumbnails/ { file_hash } .jpg ' ) # this might be a problem in case of duplicate hashes
obj_storage . PutFile ( thumb_path , f ' thumbnails/ { file_hash } .jpg ' )
thumbnail_url = f " https://cdn.altpins.com/thumbnails/ { file_hash } .jpg "
phash = funcs . generate_phash ( thumb_path )
os . remove ( thumb_path )
@ -81,18 +83,17 @@ def UploadMedia(media):
# Prefer the platform media_id as the stored filename; fall back to the hash.
custom_filename = media_id if media_id else file_hash
newFilename = f ' { custom_filename } { file_extension } '
server_path = f ' media/ { post_type } / { username } / { newFilename } '
file_url = f " https://cdn.altpins.com/ { server_path } "
obj_storage . PutFile ( filepath , server_path )
# Stories that belong to a highlight also get a row in `highlights`
# (pre-patch one-liner plus the post-patch wrapped form below).
if highlight_id :
newCursor . execute ( " INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES ( %s , %s , %s ) " , ( highlight_id , user_id , media_id ) )
newCursor . execute ( " INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES ( %s , %s , %s ) " ,
( highlight_id , user_id , media_id ) )
newDB . commit ( )
print ( f ' [ { newCursor . rowcount } ] added highlight { highlight_id } to user { user_id } ' )
query = " INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform, file_size) VALUES ( %s , %s , %s , %s , %s , %s , %s , %s , %s , %s , %s , %s , %s , %s , %s , %s ) "
values = ( username , media_type , file_url , width , height , media_id , post_type , post_date , user_id , file_hash , filename , duration , thumbnail_url , phash , platform , file_size )
@ -104,14 +105,13 @@ def UploadMedia(media):
print ( f ' File: { filename } ' )
print ( f ' URL: { file_url } ' )
print ( f ' Pin URL: https://altpins.com/pin/ { newCursor . lastrowid } ' )
print ( " = " * 100 )
print ( " = " * 100 )
# Local file is uploaded and recorded: delete it and remember its media_id so
# later files in this run dedupe against it. Returns the new DB row id.
os . remove ( filepath )
existing_media_ids . add ( media_id )
return newCursor . lastrowid
# NOTE(review): corrupted-diff residue — the hunk header below hides the
# frame-capture/write lines of this function. Kept byte-identical.
# Extracts a poster frame from a video into a uniquely named temp .jpg and
# returns that path; the caller is responsible for deleting it.
def generate_thumbnail ( filepath ) :
thumb_path = os . path . join ( temp_directory , f ' { uuid4 ( ) } .jpg ' )
cap = cv2 . VideoCapture ( filepath )
@ -120,16 +120,16 @@ def generate_thumbnail(filepath):
# Release the OpenCV capture handle before handing the path back.
cap . release ( )
return thumb_path
def get_user_id(username):
    """Return the cached user id for *username* (case-insensitive), or None.

    Reads the module-level `existing_users` mapping of lowercased usernames.
    """
    return existing_users.get(username.lower())
# NOTE(review): corrupted-diff residue — hunk headers below hide the lines that
# unpack `parts` into username/timestamp/user_id and the media_id branch; both
# pre- and post-patch forms of the final dict remain. Kept byte-identical.
# Parses a "a~b~c~d" structured filename into a media record dict; returns
# False when the filename does not have exactly four '~'-separated parts.
def get_media_data ( filepath ) :
filename = os . path . basename ( filepath )
parts = filename . split ( ' ~ ' )
if len ( parts ) != 4 :
return False
@ -141,7 +141,7 @@ def get_media_data(filepath):
platform = ' instagram '
# Highlight stories encode "highlight<id>" in the user_id slot.
highlight_id = user_id . replace ( ' highlight ' , ' ' ) if ' highlight ' in user_id else None
if user_id . isdigit ( ) :
user_id = int ( user_id )
else :
@ -152,17 +152,17 @@ def get_media_data(filepath):
else :
media_id = None
data = { ' username ' : username , ' timestamp ' : timestamp , ' media_id ' : media_id , ' user_id ' : user_id , ' filepath ' : filepath , ' highlight_id ' : highlight_id , ' platform ' : platform }
data = { ' username ' : username , ' timestamp ' : timestamp , ' media_id ' : media_id , ' user_id ' : user_id ,
' filepath ' : filepath , ' highlight_id ' : highlight_id , ' platform ' : platform }
return data
# NOTE(review): corrupted-diff residue — the hunk header below hides the inner
# per-file loop where `filepath` and `data` are produced (presumably via
# get_media_data — confirm). Kept byte-identical.
# Walks each media-type folder and returns (parsed records, paths that failed
# to parse).
def get_media ( ) :
medias = [ ]
failed_medias = [ ]
for media_type , post_type in media_types . items ( ) :
media_folder_path = os . path . join ( directory , media_type )
if not os . path . exists ( media_folder_path ) :
continue
@ -172,26 +172,23 @@ def get_media():
# Unparseable files are collected for the custom-media fallback pass.
if not data :
failed_medias . append ( filepath )
continue
data [ ' post_type ' ] = post_type
medias . append ( data )
return medias , failed_medias
# NOTE(review): corrupted-diff residue — hunk headers below hide the start of
# the `data` dict construction and part of the small-media_id branch. Kept
# byte-identical.
# Fallback for files get_media() could not parse: treats each per-user
# subfolder as <media_type>/<username>/<file> and synthesises a record from
# the path, ctime, and bare filename.
def get_custom_media ( failed_medias ) :
medias = [ ]
for media_type , post_type in media_types . items ( ) :
folder_path = os . path . join ( directory , media_type )
user_dirs = [ d for d in os . listdir ( folder_path ) if os . path . isdir ( os . path . join ( folder_path , d ) ) ]
for username in user_dirs :
user_folder_path = os . path . join ( folder_path , username )
for filename in os . listdir ( user_folder_path ) :
# Skip hidden files such as .DS_Store.
if filename . startswith ( ' . ' ) :
continue
filepath = os . path . join ( user_folder_path , filename )
# Only reprocess files the first pass flagged as failed.
if not filepath in failed_medias :
continue
@ -199,7 +196,7 @@ def get_custom_media(failed_medias):
user_id = get_user_id ( username )
timestamp = int ( os . path . getctime ( filepath ) )
media_id = os . path . splitext ( filename ) [ 0 ]
if media_id . isdigit ( ) :
media_id = int ( media_id )
if media_id < 10000000 :
@ -217,40 +214,35 @@ def get_custom_media(failed_medias):
" highlight_id " : None ,
" post_type " : post_type
}
medias . append ( data )
return medias
def save_highlight_data(highlights):
    """Persist scraped highlight records as highlight_data/<uuid4>.json.

    Each call writes a uniquely named file so concurrent/successive runs never
    overwrite each other.
    """
    # The target folder may not exist on a fresh checkout — the original
    # open() would raise FileNotFoundError in that case.
    os.makedirs('highlight_data', exist_ok=True)
    filename = f'{uuid4()}.json'
    filepath = os.path.join('highlight_data', filename)
    with open(filepath, 'w') as f:
        json.dump(highlights, f)
# NOTE(review): corrupted-diff residue — hunk headers below hide the lines that
# initialise `highlights`, assign user ids in the second loop, and start the
# main upload loop; the highlights-append dict appears in both pre- and
# post-patch indentation. Kept byte-identical.
# Orchestrates a full run: gather records, drop dupes, retry failed parses as
# custom media, propagate user ids, save highlight metadata, then upload.
def dump_instagram ( ) :
medias , failed_medias = get_media ( )
medias = clean_dupes ( medias )
failed_medias = get_custom_media ( failed_medias )
# Deterministic processing order: by user, then chronologically.
medias . sort ( key = lambda x : ( x [ ' username ' ] . lower ( ) , x [ ' timestamp ' ] ) )
# Update new user ids and existing user ids
new_user_ids = { }
for media in medias :
user_id = media [ ' user_id ' ]
username = media [ ' username ' ]
if not media [ ' user_id ' ] :
continue
if username in existing_users :
continue
# First sighting of this user in this run: remember the id globally.
existing_users [ username ] = user_id
new_user_ids [ username ] = user_id
# Assign user ids
for media in medias :
if media [ ' user_id ' ] :
continue
@ -262,13 +254,12 @@ def dump_instagram():
if not media [ ' highlight_id ' ] :
continue
highlights . append ( {
" media_id " : media [ " media_id " ] ,
" user_id " : media [ " user_id " ] ,
" highlight_id " : media [ ' highlight_id ' ] ,
" username " : media [ ' username ' ] ,
} )
" media_id " : media [ " media_id " ] ,
" user_id " : media [ " user_id " ] ,
" highlight_id " : media [ ' highlight_id ' ] ,
" username " : media [ ' username ' ] ,
} )
# save highlights data into folder highlight_Data
if highlights :
save_highlight_data ( highlights )
@ -280,85 +271,97 @@ def dump_instagram():
# Retry pass over the custom-media records.
for media in failed_medias :
pinid = UploadMedia ( media )
def clean_dupes(medias):
    """Filter out invalid and duplicate records, deleting duplicate files.

    A record is dropped (and its file removed) when its media_id is already in
    the module-level `existing_media_ids`, or when its path carries an
    OS-style " (N)" copy suffix. Records without a media_id are skipped but
    their files are left alone. Returns the surviving records.
    """
    survivors = []
    dupes = 0
    for record in medias:
        current_id = record['media_id']
        path = record['filepath']
        if not current_id:
            print(f'Invalid media_id for file {path}. Skipping...')
            continue
        # Same message and same removal for both duplicate conditions.
        if current_id in existing_media_ids or re.search(r'\(\d+\)', path):
            dupes += 1
            print(f'Found duplicate file {path}. Removing...')
            os.remove(path)
            continue
        survivors.append(record)
    print(f'Removed {dupes} duplicate files.')
    return survivors
# -------------------- CACHE SYSTEM --------------------
def get_cached_data():
    """Load (media_ids, users, last_id) from the JSON cache at CACHE_FILE.

    Returns (None, None, None) when the cache is missing or unreadable so the
    caller falls back to a full database pull.
    NOTE(review): reconstructed from a corrupted diff — the pre-patch TTL check
    and its 2-tuple returns were removed by the patch in favour of the
    incremental `last_id` refresh; a stray trailing `return None, None` is
    dropped here.
    """
    if not os.path.exists(CACHE_FILE):
        print('No cache file found. Generating new cache…')
        return None, None, None
    try:
        with open(CACHE_FILE, 'r') as f:
            cache = json.load(f)
        media_ids = set(cache.get('media_ids', []))
        # Usernames are compared lowercased everywhere in this file.
        users = {k.lower(): v for k, v in cache.get('existing_users', {}).items()}
        last_id = cache.get('last_id', 0)
        return media_ids, users, last_id
    except Exception as e:
        print(f"Cache read error: {e}")
        return None, None, None
def save_cached_data(media_ids, existing_users, last_id):
    """Write the cache consumed by get_cached_data() to CACHE_FILE.

    Stores the known media ids, the username→user_id map, the incremental
    high-water row id, and a timestamp. The pre-patch signature line in the
    corrupted diff contained a broken identifier ("existing_ media_ids");
    this is the reconstructed post-patch version.
    """
    with open(CACHE_FILE, 'w') as f:
        json.dump({
            'timestamp': datetime.now().isoformat(),
            'media_ids': list(media_ids),  # sets are not JSON-serialisable
            'existing_users': existing_users,
            'last_id': last_id
        }, f)
def get_user_ids(cur):
    """Return a {lowercased username: user_id} map for all known Instagram users.

    Assumes *cur* is a dict-style DB cursor (rows indexable by column name).
    """
    cur.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL AND platform='instagram'")
    mapping = {}
    for row in cur.fetchall():
        mapping[row['username'].lower()] = row['user_id']
    return mapping
def get_existing_media_ids(cur):
    """Full DB pull: ({media_id, ...}, highest row id) for public Instagram media."""
    cur.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform='instagram' AND status='public'")
    rows = cur.fetchall()
    media_ids = {row['media_id'] for row in rows}
    # default=0 keeps an empty table from crashing max().
    last_id = max((row['id'] for row in rows), default=0)
    return media_ids, last_id


def get_existing_medias(cur):
    """Return (media_ids, users), served from the JSON cache when possible.

    Cold cache: pull everything and write the cache. Warm cache: fetch only
    rows with id > last_id, fold them in, and advance the high-water mark.
    NOTE(review): reconstructed from a corrupted diff in which the pre- and
    post-patch lines of this function and get_existing_media_ids were
    interleaved; assumes dict-style cursor rows — confirm against config.
    """
    media_ids, users, last_id = get_cached_data()

    if not media_ids or not users:
        print('Cold cache → pulling full data...')
        media_ids, last_id = get_existing_media_ids(cur)
        users = get_user_ids(cur)
        save_cached_data(media_ids, users, last_id)
        return media_ids, users

    # Incremental refresh: only rows newer than the cached high-water mark.
    cur.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform='instagram' AND status='public' AND id > %s ORDER BY id ASC", (last_id,))
    rows = cur.fetchall()

    for r in rows:
        media_ids.add(r['media_id'])
        last_id = max(last_id, r['id'])

    # Rewrite the cache only when something actually changed.
    if rows:
        save_cached_data(media_ids, users, last_id)
    return media_ids, users
# -------------------- MAIN --------------------
if __name__ == '__main__':
    # NOTE(review): reconstructed from a corrupted diff; a hunk boundary sits
    # just after the first print, so unchanged lines may be hidden there —
    # confirm against the original file. The duplicated connection line and
    # the dead commented-out cleanup loop from the diff residue are dropped.
    print('Starting processing...')

    # Bail out early when there is nothing to upload.
    if not funcs.get_files(directory):
        print('No files to process. Exiting...')
        exit()

    newDB, newCursor = config.gen_connection()
    obj_storage = config.get_storage()

    existing_media_ids, existing_users = get_existing_medias(newCursor)

    dump_instagram()
    print("Processing completed.")