From 622c27e19a281b344804be038ac2f45d5c9933ae Mon Sep 17 00:00:00 2001
From: Blake Fitch <blake.fitch@tuebingen.mpg.de>
Date: Sun, 30 Jun 2024 13:13:30 +0200
Subject: [PATCH] enable removing old confirmed_put files (objects) when
 skip_on_zero_len and replace_existing are both set

---
 pymods/irods_utils/irods_utils.py | 49 ++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 17 deletions(-)

diff --git a/pymods/irods_utils/irods_utils.py b/pymods/irods_utils/irods_utils.py
index 9fc32f8..a86b158 100644
--- a/pymods/irods_utils/irods_utils.py
+++ b/pymods/irods_utils/irods_utils.py
@@ -350,12 +350,13 @@ def streaming_transfer_file_to_object( args ):
     return rc, hash_digest256, hash_digest512, transfer_size
 
 # Returns None or the data_object irods ref.
-def confirmed_put( irods_sesh, file_pathname, new_obj_ipath, metadata_dict=None, datatype=None, block_size=(2**28), replace_existing=None ):
+def confirmed_put( irods_sesh, file_pathname, new_obj_ipath, metadata_dict=None, datatype=None, block_size=(2**28), replace_existing=False, skip_on_zero_len=False ):
     logging.debug( "file_pathname: "  + file_pathname
                  + " new_obj_ipath: " + new_obj_ipath
                  + " datatype: "      + datatype
                  + " block_size: "    + str( block_size )
-                 + " replace_existing " + str( replace_existing) )
+                 + " replace_existing " + str( replace_existing)
+                 + " skip_on_zero_len " + str( skip_on_zero_len ) )
 
     # NOTE: this routine uploades the file as a temporary iRODS object.
     # Common sense suggests checking if the objct already exsists, a rare condition first.
@@ -368,6 +369,34 @@ def confirmed_put( irods_sesh, file_pathname, new_obj_ipath, metadata_dict=None,
         logging.error( "irods_sesh == None" )
         return None
 
+    # Figure out if this path in the archive is occupied with pre-existing data.
+    existing_obj = None
+    try:
+        options = {kw.VERIFY_CHKSUM_KW: ''}
+        existing_obj = irods_sesh.data_objects.get( new_obj_ipath, **options )
+        logging.debug( "Object aleady exists at ipath: " + new_obj_ipath )
+    except irods_ex.DataObjectDoesNotExist:
+        logging.debug( "DataObjectDoesNotExist " + new_obj_ipath )
+    except irods_ex.OBJ_PATH_DOES_NOT_EXIST:
+        logging.debug( "OBJ_PATH_DOES_NOT_EXIST " + new_obj_ipath )
+    except Exception as ex:
+        logging.error("Failed using data_onject.get() (but not DataObjectDoesNotExist) ipath: " + new_obj_ipath + " ex: " + str(ex) + " type " + str(type(ex)) )
+        return None
+
+    try:
+        upload_file_size = os.path.getsize( file_pathname )
+    except Exception as ex:
+        logging.error( f"FATAL: Failed to get size of file to upload. file_pathname: {file_pathname} ex: {ex}" )
+        raise
+
+    if upload_file_size == 0 and existing_obj is not None and replace_existing and skip_on_zero_len :
+        logging.warning( f"got zero len file with replace_existing -- removing current data object. ipath: {new_obj_ipath}" )
+        try:
+            existing_obj.unlink(force=True)
+        except Exception as ex:
+            logging.warning( f"Failed remove zero line file on replace_existing and skip_on_zero_len. ipath: {new_obj_ipath}  ex: {ex} " )
+            raise
+
     # Make a timestamped partial tmp file name which, if the upload somehow fails here, will be left behind
     # Hopefully this does not happen much, but we have seen at least on case so far.
     # Cleanup will need to be done elsewhere.
@@ -428,20 +457,6 @@ def confirmed_put( irods_sesh, file_pathname, new_obj_ipath, metadata_dict=None,
         logging.error("metadata AVU dict >" + str( metadata_dict ) + "<" )
         return None
 
-    existing_obj = None
-
-    try:
-        options = {kw.VERIFY_CHKSUM_KW: ''}
-        existing_obj = irods_sesh.data_objects.get( new_obj_ipath, **options )
-        logging.debug( "Object aleady exists at ipath: " + new_obj_ipath )
-    except irods_ex.DataObjectDoesNotExist:
-        logging.debug( "DataObjectDoesNotExist " + new_obj_ipath )
-    except irods_ex.OBJ_PATH_DOES_NOT_EXIST:
-        logging.debug( "OBJ_PATH_DOES_NOT_EXIST " + new_obj_ipath )
-    except Exception as ex:
-        logging.error("Failed using data_onject.get() (but not DataObjectDoesNotExist) ipath: " + new_obj_ipath + " ex: " + str(ex) + " type " + str(type(ex)) )
-        return None
-
     mismatch = False
 
     if existing_obj != None:
@@ -512,7 +527,7 @@ def confirmed_put( irods_sesh, file_pathname, new_obj_ipath, metadata_dict=None,
             dup_obj.unlink(force=True)
         except Exception as ex:
             logging.warning( "Failed remove dup when remove_existing flag set. ipath: {dup_ipath}  ex: {ex} " )
-          
+
     new_obj = None
     try:
         new_obj = irods_sesh.data_objects.get( new_obj_ipath )
-- 
GitLab