Skip to content
Snippets Groups Projects
Commit adc79dbd authored by Eric Cano's avatar Eric Cano
Browse files

#252: Added abort partial prepare test in archive retrieve test.

parent 7ee74286
No related branches found
No related tags found
No related merge requests found
......@@ -96,6 +96,7 @@ RUN yum-config-manager --enable epel --setopt="epel.priority=4" \
jsoncpp \
libmicrohttpd \
jq \
python36 \
&& \
yum clean all \
&& \
......
......@@ -7,7 +7,7 @@ if [ ! -e /etc/buildtreeRunner ]; then
yum-config-manager --enable ceph
# Install missing RPMs
yum -y install cta-cli cta-debuginfo xrootd-client eos-client jq
yum -y install cta-cli cta-debuginfo xrootd-client eos-client jq python36
## Keep this temporary fix that may be needed if going to protobuf3-3.5.1 for CTA
# Install eos-protobuf3 separately as eos is OK with protobuf3 but cannot use it..
......
......@@ -54,6 +54,7 @@ echo " Archiving ${NB_FILES} files of ${FILE_SIZE_KB}kB each"
echo " Archiving files: xrdcp as user1"
echo " Retrieving them as poweruser1"
# Stage the test driver and its Python helper in the client pod.
# The abort-prepare step invokes the helper as /root/client_ar_abortPrepare.py,
# so it must be copied under exactly that name (it used to be copied as
# /root/client_abortPrepare.sh, which nothing references).
kubectl -n "${NAMESPACE}" cp client_ar.sh client:/root/client_ar.sh
kubectl -n "${NAMESPACE}" cp client_ar_abortPrepare.py client:/root/client_ar_abortPrepare.py
# Run the archive/retrieve test; abort the whole run on failure.
kubectl -n "${NAMESPACE}" exec client -- bash /root/client_ar.sh -n "${NB_FILES}" -s "${FILE_SIZE_KB}" -p 100 -d /eos/ctaeos/preprod -v -r || exit 1
# Then scan the MGM xrootd log on the ctaeos pod for errors.
kubectl -n "${NAMESPACE}" exec ctaeos -- bash /root/grep_xrdlog_mgm_for_error.sh || exit 1
......
......@@ -140,8 +140,11 @@ mkdir -p ${LOGDIR} || die "Cannot create directory LOGDIR: ${LOGDIR}"
# Directory that receives the per-file xrootd error logs moved out of
# ${ERROR_DIR} later in the script.
mkdir -p ${LOGDIR}/xrd_errors || die "Cannot create directory LOGDIR/xrd_errors: ${LOGDIR}/xrd_errors"
# Scratch files created with mktemp; their paths are echoed with a timestamp
# so they can be located from the test output.
STATUS_FILE=$(mktemp)
echo "$(date +%s): STATUS_FILE=${STATUS_FILE}"
ERROR_FILE=$(mktemp)
echo "$(date +%s): ERROR_FILE=${ERROR_FILE}"
EOS_BATCHFILE=$(mktemp --suffix=.eosh)
echo "$(date +%s): EOS_BATCHFILE=${EOS_BATCHFILE}"
# Generate the random payload (${FILE_KB_SIZE} blocks of 1 kB) that is copied
# to EOS for every test file.
dd if=/dev/urandom of=/tmp/testfile bs=1k count=${FILE_KB_SIZE} || exit 1
......@@ -181,15 +184,16 @@ eos root://${EOSINSTANCE} mkdir -p ${EOS_DIR} || die "Cannot create directory ${
# Create directory for xrootd error reports
ERROR_DIR="/dev/shm/$(basename ${EOS_DIR})"
mkdir ${ERROR_DIR}
echo "$(date +%s): ERROR_DIR=${ERROR_DIR}"
# not more than 100k files per directory so that we can rm and find as a standard user
for ((subdir=0; subdir < ${NB_DIRS}; subdir++)); do
eos root://${EOSINSTANCE} mkdir -p ${EOS_DIR}/${subdir} || die "Cannot create directory ${EOS_DIR}/{subdir} in eos instance ${EOSINSTANCE}."
echo -n "Copying files to ${EOS_DIR}/${subdir} using ${NB_PROCS} processes..."
for ((i=0;i<${NB_FILES};i++)); do
echo ${TEST_FILE_NAME_BASE}$(printf %.2d ${subdir})$(printf %.6d $i)
done | xargs --max-procs=${NB_PROCS} -iTEST_FILE_NAME bash -c "XRD_LOGLEVEL=Dump xrdcp /tmp/testfile root://${EOSINSTANCE}/${EOS_DIR}/${subdir}/TEST_FILE_NAME 2>${ERROR_DIR}/TEST_FILE_NAME && rm ${ERROR_DIR}/TEST_FILE_NAME || echo ERROR with xrootd transfer for file TEST_FILE_NAME, full logs in ${ERROR_DIR}/TEST_FILE_NAME"
#done | xargs --max-procs=${NB_PROCS} -iTEST_FILE_NAME xrdcp --silent /tmp/testfile root://${EOSINSTANCE}/${EOS_DIR}/${subdir}/TEST_FILE_NAME
# done | xargs -n ${BATCH_SIZE} --max-procs=${NB_BATCH_PROCS} ./batch_xrdcp /tmp/testfile root://${EOSINSTANCE}/${EOS_DIR}/${subdir}
done | xargs --max-procs=${NB_PROCS} -iTEST_FILE_NAME bash -c "XRD_LOGLEVEL=Dump xrdcp /tmp/testfile root://${EOSINSTANCE}/${EOS_DIR}/${subdir}/TEST_FILE_NAME 2>${ERROR_DIR}/TEST_FILE_NAME && rm ${ERROR_DIR}/TEST_FILE_NAME || echo ERROR with xrootd transfer for file TEST_FILE_NAME, full logs in ${ERROR_DIR}/TEST_FILE_NAME"
#done | xargs --max-procs=${NB_PROCS} -iTEST_FILE_NAME xrdcp --silent /tmp/testfile root://${EOSINSTANCE}/${EOS_DIR}/${subdir}/TEST_FILE_NAME
# done | xargs -n ${BATCH_SIZE} --max-procs=${NB_BATCH_PROCS} ./batch_xrdcp /tmp/testfile root://${EOSINSTANCE}/${EOS_DIR}/${subdir}
echo Done.
done
if [ "0" != "$(ls ${ERROR_DIR} 2> /dev/null | wc -l)" ]; then
......@@ -273,8 +277,8 @@ done
# cat ${STATUS_FILE} | KRB5CCNAME=/tmp/${EOSPOWER_USER}/krb5cc_0 XrdSecPROTOCOL=krb5 xargs --max-procs=${NB_PROCS} -iTEST_FILE_NAME xrdfs ${EOSINSTANCE} prepare -s ${EOS_DIR}/TEST_FILE_NAME 2>&1 | tee ${ERROR_FILE}
# CAREFUL HERE: ${STATUS_FILE} contains lines like: 99/test9900001
for ((subdir=0; subdir < ${NB_DIRS}; subdir++)); do
echo -n "Recalling files to ${EOS_DIR}/${subdir} using ${NB_PROCS} processes..."
cat ${STATUS_FILE} | grep ^${subdir}/ | cut -d/ -f2 | xargs --max-procs=${NB_PROCS} -iTEST_FILE_NAME bash -c "XRD_LOGLEVEL=Dump KRB5CCNAME=/tmp/${EOSPOWER_USER}/krb5cc_0 XrdSecPROTOCOL=krb5 xrdfs ${EOSINSTANCE} prepare -s ${EOS_DIR}/${subdir}/TEST_FILE_NAME?activity=T0Reprocess 2>${ERROR_DIR}/RETRIEVE_TEST_FILE_NAME && rm ${ERROR_DIR}/RETRIEVE_TEST_FILE_NAME || echo ERROR with xrootd transfer for file TEST_FILE_NAME, full logs in ${ERROR_DIR}/RETRIEVE_TEST_FILE_NAME" | tee ${LOGDIR}/prepare_${subdir}.log | grep ^ERROR
echo -n "Retrieving files to ${EOS_DIR}/${subdir} using ${NB_PROCS} processes..."
cat ${STATUS_FILE} | grep ^${subdir}/ | cut -d/ -f2 | xargs --max-procs=${NB_PROCS} -iTEST_FILE_NAME bash -c "XRD_LOGLEVEL=Dump KRB5CCNAME=/tmp/${EOSPOWER_USER}/krb5cc_0 XrdSecPROTOCOL=krb5 xrdfs ${EOSINSTANCE} prepare -s ${EOS_DIR}/${subdir}/TEST_FILE_NAME?activity=T0Reprocess 2>${ERROR_DIR}/RETRIEVE_TEST_FILE_NAME && rm ${ERROR_DIR}/RETRIEVE_TEST_FILE_NAME || echo ERROR with xrootd prepare stage for file TEST_FILE_NAME, full logs in ${ERROR_DIR}/RETRIEVE_TEST_FILE_NAME" | tee ${LOGDIR}/prepare_${subdir}.log | grep ^ERROR
echo Done.
cat ${STATUS_FILE} | grep ^${subdir}/ | cut -d/ -f2 | xargs --max-procs=${NB_PROCS} -iTEST_FILE_NAME bash -c "XRD_LOGLEVEL=Dump KRB5CCNAME=/tmp/${EOSPOWER_USER}/krb5cc_0 XrdSecPROTOCOL=krb5 xrdfs ${EOSINSTANCE} query opaquefile ${EOS_DIR}/${subdir}/TEST_FILE_NAME?mgm.pcmd=xattr\&mgm.subcmd=get\&mgm.xattrname=sys.retrieve.req_id 2>${ERROR_DIR}/XATTRGET_TEST_FILE_NAME && rm ${ERROR_DIR}/XATTRGET_TEST_FILE_NAME || echo ERROR with xrootd xattr get for file TEST_FILE_NAME, full logs in ${ERROR_DIR}/XATTRGET_TEST_FILE_NAME" | tee ${LOGDIR}/prepare_sys.retrieve.req_id_${subdir}.log | grep ^ERROR
done
......@@ -330,10 +334,10 @@ for ((subdir=0; subdir < ${NB_DIRS}; subdir++)); do
eos root://${EOSINSTANCE} ls -y ${EOS_DIR}/${subdir} | egrep 'd[1-9][0-9]*::t1' | sed -e "s%\s\+% %g;s%.* \([^ ]\+\)$%${subdir}/\1%" >> ${STATUS_FILE}
done
TO_STAGERRM=$(cat ${STATUS_FILE} | wc -l)
TO_EVICT=$(cat ${STATUS_FILE} | wc -l)
echo "$(date +%s): $TO_STAGERRM files to be stagerrm'ed from EOS using 'xrdfs prepare -e'"
# We need the -s as we are staging the files from tape (see xrootd prepare definition)
echo "$(date +%s): $TO_EVICT files to be evicted from EOS using 'xrdfs prepare -e'"
# We need the -e as we are evicting the files from disk cache (see xrootd prepare definition)
cat ${STATUS_FILE} | sed -e "s%^%${EOS_DIR}/%" | XrdSecPROTOCOL=krb5 KRB5CCNAME=/tmp/${EOSPOWER_USER}/krb5cc_0 xargs --max-procs=10 -n 40 xrdfs ${EOSINSTANCE} prepare -e > /dev/null
......@@ -342,29 +346,111 @@ for ((subdir=0; subdir < ${NB_DIRS}; subdir++)); do
LEFTOVER=$(( ${LEFTOVER} + $(eos root://${EOSINSTANCE} ls -y ${EOS_DIR}/${subdir} | egrep '^d[1-9][0-9]*::t1' | wc -l) ))
done
STAGERRMED=$((${TO_STAGERRM}-${LEFTOVER}))
echo "$(date +%s): $STAGERRMED files stagerrmed from EOS 'xrdfs prepare -e'"
EVICTED=$((${TO_EVICT}-${LEFTOVER}))
echo "$(date +%s): $EVICTED/$TO_EVICT files evicted from EOS 'xrdfs prepare -e'"
LASTCOUNT=${STAGERRMED}
LASTCOUNT=${EVICTED}
#echo "$(date +%s): Dumping objectstore list"
#ssh root@ctappsfrontend cta-objectstore-list
# Build the list of tape only files.
# STATUS_FILE is reset, then filled with one "<subdir>/<file name>" line per
# entry of `eos ls -y` matching d0::t[^0] (no disk replica, non-zero tape
# copy count).
rm -f ${STATUS_FILE}
touch ${STATUS_FILE}
for ((subdir=0; subdir < ${NB_DIRS}; subdir++)); do
# sed first squeezes runs of whitespace into single spaces, then replaces the
# whole line by "${subdir}/" followed by its last space-separated field.
eos root://${EOSINSTANCE} ls -y ${EOS_DIR}/${subdir} | egrep 'd0::t[^0]' | sed -e "s%\s\+% %g;s%.* \([^ ]\+\)$%${subdir}/\1%" >> ${STATUS_FILE}
done
# Put all tape drives down
# The drive list is captured once, before any change, because the same
# snapshot is used later to bring back up exactly the drives that were UP.
admin_kdestroy &>/dev/null
admin_kinit &>/dev/null
INITIAL_DRIVES_STATE=$(admin_cta --json dr ls)
echo INITIAL_DRIVES_STATE:
# The JSON must be expanded quoted: unquoted, the shell would word-split it and
# glob-expand any word containing '*', '?' or '[...]' before jq sees it.
printf '%s\n' "${INITIAL_DRIVES_STATE}" | jq -r '.[] | [ .driveName, .driveStatus] | @tsv' | column -t
echo -n "Will put down those drives : "
printf '%s\n' "${INITIAL_DRIVES_STATE}" | jq -r '.[] | select (.driveStatus == "UP") | .driveName'
# Word splitting of the jq output is intentional here: one drive name per word.
for d in $(printf '%s\n' "${INITIAL_DRIVES_STATE}" | jq -r '.[] | select (.driveStatus == "UP") | .driveName'); do
  admin_cta dr down "$d"
done
# Prepare-stage the files
#cat ${STATUS_FILE} | perl -p -e "s|^(.*)$|${EOS_DIR}/\$1?activity=T0Reprocess|" | \
# XRD_LOGLEVEL=Dump KRB5CCNAME=/tmp/${EOSPOWER_USER}/krb5cc_0 XrdSecPROTOCOL=krb5 xargs -n 40 --max-procs=10 \
# echo bash -c "echo xrdfs ${EOSINSTANCE} prepare -s $@" bash
# | \
# tee ${LOGDIR}/prepare_${subdir}.log | grep -i error
# Second prepare pass ("prepare2"): for every file listed in STATUS_FILE, issue
# `xrdfs prepare -s` and then read back its sys.retrieve.req_id extended
# attribute, ${NB_PROCS} files at a time.
for ((subdir=0; subdir < ${NB_DIRS}; subdir++)); do
echo -n "Retrieving files to ${EOS_DIR}/${subdir} using ${NB_PROCS} processes (prepare2)..."
# STATUS_FILE lines are "<subdir>/<file>": keep this subdir's lines and strip
# the prefix. xargs substitutes every TEST_FILE_NAME occurrence in the bash -c
# string with the file name. stderr of each xrdfs call goes to a per-file log
# in ERROR_DIR that is removed on success; on failure an "ERROR ..." line is
# printed instead. Everything is saved by tee and only ERROR lines are shown.
cat ${STATUS_FILE} | grep ^${subdir}/ | cut -d/ -f2 | xargs --max-procs=${NB_PROCS} -iTEST_FILE_NAME bash -c "XRD_LOGLEVEL=Dump KRB5CCNAME=/tmp/${EOSPOWER_USER}/krb5cc_0 XrdSecPROTOCOL=krb5 xrdfs ${EOSINSTANCE} prepare -s ${EOS_DIR}/${subdir}/TEST_FILE_NAME?activity=T0Reprocess 2>${ERROR_DIR}/RETRIEVE_TEST_FILE_NAME && rm ${ERROR_DIR}/RETRIEVE_TEST_FILE_NAME || echo ERROR with xrootd prepare stage for file TEST_FILE_NAME, full logs in ${ERROR_DIR}/RETRIEVE_TEST_FILE_NAME" | tee ${LOGDIR}/prepare2_${subdir}.log | grep ^ERROR
echo Done.
echo -n "Checking the presence of the sys.retrieve.req_id extended attrbutes..."
# Same pipeline shape as above, but querying the sys.retrieve.req_id xattr of
# each file through `xrdfs query opaquefile`; the '&' separators are escaped
# because the command is evaluated by the inner bash -c.
cat ${STATUS_FILE} | grep ^${subdir}/ | cut -d/ -f2 | xargs --max-procs=${NB_PROCS} -iTEST_FILE_NAME bash -c "XRD_LOGLEVEL=Dump KRB5CCNAME=/tmp/${EOSPOWER_USER}/krb5cc_0 XrdSecPROTOCOL=krb5 xrdfs ${EOSINSTANCE} query opaquefile ${EOS_DIR}/${subdir}/TEST_FILE_NAME?mgm.pcmd=xattr\&mgm.subcmd=get\&mgm.xattrname=sys.retrieve.req_id 2>${ERROR_DIR}/XATTRGET_TEST_FILE_NAME && rm ${ERROR_DIR}/XATTRGET_TEST_FILE_NAME || echo ERROR with xrootd xattr get for file TEST_FILE_NAME, full logs in ${ERROR_DIR}/XATTRGET_TEST_FILE_NAME" | tee ${LOGDIR}/prepare2_sys.retrieve.req_id_${subdir}.log | grep ^ERROR
echo Done.
done
# Any file left in ERROR_DIR is the log of a failed xrdfs call (successful
# calls remove theirs): report and move them next to the other logs.
if [ "0" != "$(ls ${ERROR_DIR} 2> /dev/null | wc -l)" ]; then
# there were some prepare errors
echo "Several prepare errors occured during retrieval!"
echo "Please check client pod logs in artifacts"
mv ${ERROR_DIR}/* ${LOGDIR}/xrd_errors/
fi
# Ensure all requests files are queued
# Sum queuedFiles over ALL retrieve queues. Collecting the values in one array
# and using `add // 0` yields exactly one number: the previous filter printed
# one number per queue (and nothing at all without any retrieve queue), which
# made the numeric comparison below fail instead of reporting a mismatch.
requestsTotal=$(admin_cta --json sq | jq -r '[ .[] | select (.mountType == "RETRIEVE") | .queuedFiles | tonumber ] | add // 0')
# Still guard against an empty result, e.g. if admin_cta or jq failed.
requestsTotal=${requestsTotal:-0}
filesCount=$(wc -l "${STATUS_FILE}" | cut -d " " -f 1)
# A mismatch is only reported; the test deliberately carries on.
if [ "${requestsTotal}" -ne "${filesCount}" ]; then
  echo "ERROR: Retrieve queue(s) size mismatch: ${requestsTotal} requests queued for ${filesCount} files."
fi
# Abort prepare -s requests
# For every file of STATUS_FILE (lines are "<subdir>/<file>"), run the
# /root/client_ar_abortPrepare.py helper, ${NB_PROCS} at a time. xargs replaces
# TEST_FILE_NAME by the file name. The helper output is kept in a per-subdir
# log via tee; filtering on ^ERROR is currently commented out, so the complete
# output is also shown.
for ((subdir=0; subdir < ${NB_DIRS}; subdir++)); do
echo -n "Cancelling prepare for files in ${EOS_DIR}/${subdir} using ${NB_PROCS} processes (prepare_abort)..."
cat ${STATUS_FILE} | grep ^${subdir}/ | cut -d/ -f2 \
| xargs --max-procs=${NB_PROCS} -iTEST_FILE_NAME /root/client_ar_abortPrepare.py --eos-instance ${EOSINSTANCE} \
--eos-poweruser ${EOSPOWER_USER} --eos-dir ${EOS_DIR} --subdir ${subdir} --file TEST_FILE_NAME --error-dir ${ERROR_DIR} \
| tee ${LOGDIR}/prepare_abort_sys.retrieve.req_id_${subdir}.log # | grep ^ERROR
echo Done.
done
# Updating all files statuses
# Please note that s/d[^0]::t[^0] now maps to 'retrieved' and not 'archived' as
# in previous status mappings
# STATUS_FILE is overwritten. The first sed expression rewrites each listing
# line into "<file name> <dN::tM flag>"; the following ones replace the flag by
# a status word: retrieved (disk and tape), copied (disk only), error
# (neither) or tapeonly (tape only).
# NOTE(review): this lists ${EOS_DIR} itself, whereas the loops above work on
# ${EOS_DIR}/<subdir> - confirm this is intended.
eos root://${EOSINSTANCE} ls -y ${EOS_DIR} | sed -e 's/^\(d.::t.\).*\(test[0-9]\+\)$/\2 \1/;s/d[^0]::t[^0]/retrieved/;s/d[^0]::t0/copied/;s/d0::t0/error/;s/d0::t[^0]/tapeonly/' > ${STATUS_FILE}
# Put drive(s) back up to clear the queue
# Only the drives that were UP in the INITIAL_DRIVES_STATE snapshot are put
# back up. The JSON is expanded quoted so that the shell neither word-splits
# nor glob-expands it before jq parses it.
echo -n "Will put back up those drives : "
printf '%s\n' "${INITIAL_DRIVES_STATE}" | jq -r '.[] | select (.driveStatus == "UP") | .driveName'
# Word splitting of the jq output is intentional here: one drive name per word.
for d in $(printf '%s\n' "${INITIAL_DRIVES_STATE}" | jq -r '.[] | select (.driveStatus == "UP") | .driveName'); do
  admin_cta dr up "$d"
done
# Check that queues are empty after a while and files did not get retrieved
# Poll the retrieve queues once per second until no file is queued any more or
# until WAIT_FOR_RETRIEVE_QUEUES_CLEAR_TIMEOUT seconds have elapsed.

# Print the total number of files queued over all retrieve queues as a single
# integer. `add // 0` makes jq print 0 when there is no retrieve queue, and the
# :-0 fallback covers a failing admin_cta/jq. The previous filter printed one
# number per queue, which broke the numeric tests when several queues existed.
count_queued_retrieve_files() {
  local total
  total=$(admin_cta --json sq | jq -r '[ .[] | select (.mountType == "RETRIEVE") | .queuedFiles | tonumber ] | add // 0')
  echo "${total:-0}"
}

echo "$(date +%s): Waiting for retrieve queues to be cleared:"
SECONDS_PASSED=0
WAIT_FOR_RETRIEVE_QUEUES_CLEAR_TIMEOUT=$((60))
REMAINING_REQUESTS=$(count_queued_retrieve_files)
while [ "${REMAINING_REQUESTS}" -gt 0 ]; do
  echo "$(date +%s): Waiting for retrieve queues to be cleared: Seconds passed = ${SECONDS_PASSED}"
  echo "${REMAINING_REQUESTS} requests remaining."
  sleep 1
  SECONDS_PASSED=$((SECONDS_PASSED + 1))
  # Numeric >= rather than string equality, so the loop cannot run past the
  # timeout.
  if [ "${SECONDS_PASSED}" -ge "${WAIT_FOR_RETRIEVE_QUEUES_CLEAR_TIMEOUT}" ]; then
    echo "$(date +%s): Timed out after ${WAIT_FOR_RETRIEVE_QUEUES_CLEAR_TIMEOUT} seconds waiting for retrieve queues to be cleared"
    break
  fi
  REMAINING_REQUESTS=$(count_queued_retrieve_files)
done
# Check that the files were not retrieved
# TODO
# Any file present in ERROR_DIR at this point is an error log written during
# the prepare cancel test: report it and move the logs to
# ${LOGDIR}/xrd_errors/. A missing ERROR_DIR counts as "no error" because the
# ls error output is discarded.
if [ "0" != "$(ls ${ERROR_DIR} 2> /dev/null | wc -l)" ]; then
# there were some prepare errors
echo "Several errors occured during prepare cancel test!"
echo "Please check client pod logs in artifacts"
mv ${ERROR_DIR}/* ${LOGDIR}/xrd_errors/
fi
# The format of the STATUS_FILE is two space separated columns per line. The
# first column is the name of the file and the second is the status of the file.
# For example:
#
# test0000 retrieved
# test0001 retrieved
# test0002 retrieved
# test0003 retrieved
# We can now delete the files
DELETED=0
if [[ $REMOVE == 1 ]]; then
echo "Waiting for files to be removed from EOS and tapes"
......
#!/usr/bin/python3.6
"""Abort the pending retrieve (prepare) request of one file on an EOS+CTA system.

The script reads the ``sys.retrieve.req_id`` extended attribute of the file
with ``xrdfs query opaquefile`` and passes the value it got to
``xrdfs prepare -a``. Each failure is reported on stdout by a line starting
with ``ERROR`` and its details are written to a file in ``--error-dir``.

Exit status: 0 when both xrdfs calls succeeded, 1 otherwise.
"""
import argparse
import os
import subprocess
import sys


def _as_text(raw):
    """Return captured subprocess output (bytes, str or None) as text."""
    if raw is None:
        return ''
    if isinstance(raw, bytes):
        return raw.decode('utf-8', errors='replace')
    return str(raw)


def _log_failure(message, log_path, details):
    """Print the one-line failure message and store the details in log_path."""
    print(message)
    with open(log_path, 'w') as log_file:
        log_file.write(details)


def _xrdfs(arguments, env):
    """Run xrdfs with the given arguments, capturing stdout and stderr.

    Raises subprocess.CalledProcessError when xrdfs exits with a non-zero
    status; exceptions raised while starting the process are propagated.
    """
    result = subprocess.run(['xrdfs'] + list(arguments), env=env,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    result.check_returncode()
    return result


def _describe(exc):
    """Format an unexpected exception as 'TypeName [arg, arg, ...]'."""
    return type(exc).__name__ + ' [' + ', '.join(str(arg) for arg in exc.args) + ']'


def main(argv=None):
    """Entry point.

    argv: list of command line arguments (defaults to sys.argv[1:]).
    Returns the process exit status (0 on success, 1 on any failure).
    """
    # Instantiate the parser and parse command line
    parser = argparse.ArgumentParser(
        description='Utility program to abort a retrieve on an EOS+CTA system.')
    parser.add_argument('--eos-instance', required=True)
    parser.add_argument('--eos-poweruser', required=True)
    parser.add_argument('--eos-dir', required=True)
    parser.add_argument('--subdir', required=True)
    parser.add_argument('--file', required=True)
    parser.add_argument('--error-dir', required=True)
    options = parser.parse_args(argv)

    # Construct various parameters.
    eos_instance = str(options.eos_instance).rstrip()
    filepath = options.eos_dir + '/' + options.subdir + '/' + options.file
    log_suffix = options.subdir + '_' + options.file
    xattrgeterrorfilepath = options.error_dir + '/' + 'XATTRGET2_' + log_suffix
    aborterrorfilepath = options.error_dir + '/' + 'PREPAREABORT_' + log_suffix

    # Prepare the environment: same settings as the xrdfs calls of the test
    # script (full xrootd logging, kerberos credentials of the power user).
    env = os.environ.copy()
    env['XRD_LOGLEVEL'] = 'Dump'
    env['KRB5CCNAME'] = '/tmp/' + options.eos_poweruser + '/krb5cc_0'
    env['XrdSecPROTOCOL'] = 'krb5'

    # Get the xattr of the file
    try:
        xattr_res = _xrdfs(
            [eos_instance, 'query', 'opaquefile',
             filepath + '?mgm.pcmd=xattr&mgm.subcmd=get&mgm.xattrname=sys.retrieve.req_id'],
            env)
    except subprocess.CalledProcessError as cpe:
        stderr_text = _as_text(cpe.stderr)
        _log_failure('ERROR with xrdfs query for file ' + options.file + ': ' + stderr_text +
                     ' full logs in ' + xattrgeterrorfilepath,
                     xattrgeterrorfilepath, stderr_text)
        print(_as_text(cpe.stdout))
        # Without a request id there is nothing to abort.
        return 1
    except Exception as e:
        # xrdfs could not even be run, so there is no process output to log:
        # record the exception itself.
        _log_failure('ERROR with xrdfs query for file ' + options.file +
                     ': got exception of type: ' + _describe(e) +
                     ' full logs in ' + xattrgeterrorfilepath,
                     xattrgeterrorfilepath, repr(e))
        return 1

    # OK, worked... The captured output is bytes and must be decoded before it
    # can be concatenated with str or passed on as an argument.
    # NOTE(review): the whole stripped stdout is used as the request id, as
    # before - confirm this matches the output format of the xattr query.
    request_id = _as_text(xattr_res.stdout).rstrip()
    print('requestId=' + request_id)

    # We can now abort the prepare
    print('Will xrdfs ' + eos_instance + ' prepare -a ' + request_id + ' ' + filepath)
    try:
        abort_res = _xrdfs([eos_instance, 'prepare', '-a', request_id, filepath], env)
    except subprocess.CalledProcessError as cpe:
        stderr_text = _as_text(cpe.stderr)
        _log_failure('ERROR with xrdfs prepare -a ' + options.file + '(' + str(cpe.returncode) +
                     ') full logs in ' + aborterrorfilepath,
                     aborterrorfilepath, stderr_text)
        print('cpe.stderr:')
        print(stderr_text)
        print('cpe.stdout')
        print(_as_text(cpe.stdout))
        return 1
    except Exception as e:
        _log_failure('ERROR with xrdfs prepare -a for file ' + options.file +
                     ': got exception of type: ' + _describe(e) +
                     ' full logs in ' + aborterrorfilepath,
                     aborterrorfilepath, repr(e))
        return 1

    print(_as_text(abort_res.stdout))
    return 0


if __name__ == '__main__':
    sys.exit(main())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment