This page documents both the HTCondor configuration (i.e. changes to the default RPM) and any Linux kernel parameter tuning required. It is divided into three sections: configuration and tuning on the central manager, on the submit nodes, and on the execute machines.
StartD
UPDATE_INTERVAL=$RANDOM_INTEGER(540, 740, 1)
Central Manager
The central managers, test-012.t2.ucsd.edu and e143.chtc.wisc.edu, run in HA (high-availability) mode. They have 128 GB of memory and 32 cores. They have collector trees with 200 child collectors.
Central Manager Configuration:
- System Configurations:
net.core.rmem_max = 10000000 (via sysctl)
renice -5 on the main Collector process
- 00_gwms_general.config
CONDOR_HOST=$(FULL_HOSTNAME)
UID_DOMAIN=$(FULL_HOSTNAME)
FILESYSTEM_DOMAIN=$(FULL_HOSTNAME)
LOCK = $(LOG)
DAEMON_LIST = MASTER
SEC_DAEMON_SESSION_DURATION = 50000
SEC_DEFAULT_AUTHENTICATION = REQUIRED
SEC_DEFAULT_AUTHENTICATION_METHODS = FS,GSI
SEC_READ_AUTHENTICATION = OPTIONAL
SEC_CLIENT_AUTHENTICATION = OPTIONAL
DENY_WRITE = anonymous@*
DENY_ADMINISTRATOR = anonymous@*
DENY_DAEMON = anonymous@*
DENY_NEGOTIATOR = anonymous@*
DENY_CLIENT = anonymous@*
SEC_DEFAULT_ENCRYPTION = OPTIONAL
SEC_DEFAULT_INTEGRITY = REQUIRED
SEC_READ_INTEGRITY = OPTIONAL
SEC_CLIENT_INTEGRITY = OPTIONAL
SEC_READ_ENCRYPTION = OPTIONAL
SEC_CLIENT_ENCRYPTION = OPTIONAL
HOSTALLOW_WRITE = *
ALLOW_WRITE = $(HOSTALLOW_WRITE)
- 01_gwms_collectors.config
COLLECTOR_NAME = frontend_service
COLLECTOR_HOST = $(CONDOR_HOST)
COLLECTOR.USE_VOMS_ATTRIBUTES = False
COLLECTOR_MAX_FILE_DESCRIPTORS=80000
SCHEDD_MAX_FILE_DESCRIPTORS = 80000
SHARED_PORT_MAX_FILE_DESCRIPTORS = 80000
DAEMON_LIST = $(DAEMON_LIST), COLLECTOR, NEGOTIATOR
NEGOTIATOR_POST_JOB_RANK = MY.LastHeardFrom
NEGOTIATOR_INTERVAL = 60
NEGOTIATOR_MAX_TIME_PER_SUBMITTER=60
NEGOTIATOR_MAX_TIME_PER_PIESPIN=20
PREEMPTION_REQUIREMENTS = False
NEGOTIATOR_INFORM_STARTD = False
NEGOTIATOR.USE_VOMS_ATTRIBUTES = False
NEGOTIATOR_CONSIDER_PREEMPTION = False
CONDOR_VIEW_HOST = $(COLLECTOR_HOST)
- 03_gwms_local.config
GSI_DAEMON_TRUSTED_CA_DIR= /etc/grid-security/certificates
GSI_DAEMON_CERT = /etc/grid-security/hostcert.pem
GSI_DAEMON_KEY = /etc/grid-security/hostkey.pem
CERTIFICATE_MAPFILE= /etc/condor/certs/condor_mapfile
- 10_high_availability.config
CONDOR_HOST=
CENTRAL_MANAGER1 = test-012.t2.ucsd.edu
CENTRAL_MANAGER2 = e143.chtc.wisc.edu
COLLECTOR_HOST = $(CENTRAL_MANAGER1),$(CENTRAL_MANAGER2)
HAD_PORT = 51450
HAD_ARGS = -p $(HAD_PORT)
REPLICATION_PORT = 41450
REPLICATION_ARGS = -p $(REPLICATION_PORT)
REPLICATION_LIST = \
    $(CENTRAL_MANAGER1):$(REPLICATION_PORT), \
    $(CENTRAL_MANAGER2):$(REPLICATION_PORT)
HAD_LIST = \
    $(CENTRAL_MANAGER1):$(HAD_PORT), \
    $(CENTRAL_MANAGER2):$(HAD_PORT)
HAD_CONNECTION_TIMEOUT = 5
HAD_USE_PRIMARY = true
HAD = $(SBIN)/condor_had
REPLICATION = $(SBIN)/condor_replication
TRANSFERER = $(SBIN)/condor_transferd
DAEMON_LIST = $(DAEMON_LIST), HAD, REPLICATION
HAD_USE_REPLICATION = true
STATE_FILE = $(SPOOL)/Accountantnew.log
REPLICATION_INTERVAL = 300
MAX_TRANSFERER_LIFETIME = 300
HAD_UPDATE_INTERVAL = 300
MASTER_NEGOTIATOR_CONTROLLER = HAD
MASTER_HAD_BACKOFF_CONSTANT = 360
MAX_HAD_LOG = 640000
HAD_DEBUG = D_COMMAND
HAD_LOG = $(LOG)/HADLog
MAX_REPLICATION_LOG = 640000
REPLICATION_DEBUG = D_COMMAND
REPLICATION_LOG = $(LOG)/ReplicationLog
MAX_TRANSFERER_LOG = 640000
TRANSFERER_DEBUG = D_COMMAND
TRANSFERER_LOG = $(LOG)/TransferLog
HOSTALLOW_ADMINISTRATOR = $(COLLECTOR_HOST)
HOSTALLOW_NEGOTIATOR = $(COLLECTOR_HOST)
- 11_gwms_secondary_collectors.config
COLLECTOR9620 = $(COLLECTOR)
COLLECTOR9620_ENVIRONMENT = "_CONDOR_COLLECTOR_LOG=$(LOG)/Collector9620Log"
COLLECTOR9620_ARGS = -f -p 9620
DAEMON_LIST=$(DAEMON_LIST), COLLECTOR9620
# Similar entry through port 9819
- 90_gwms_dns.config
# Add certs to GSI_DAEMON_NAME with these subjects:
# /DC=com/DC=DigiCert-Grid/O=Open Science Grid/OU=Services/CN=...
# test-010.t2.ucsd.edu, test-008.t2.ucsd.edu, test-003.t2.ucsd.edu,
# pilot01/test-008.t2.ucsd.edu, gtesterpilot01/test-001.t2.ucsd.edu,
# test-009.t2.ucsd.edu, test-004.t2.ucsd.edu, test-005.t2.ucsd.edu,
# test-006.t2.ucsd.edu, test-007.t2.ucsd.edu, cmssrv240.fnal.gov,
# test-002.t2.ucsd.edu, cmssrv241.fnal.gov
- 96_scale_tweaks.config
COLLECTOR_QUERY_WORKERS=16
CONDOR_VIEW_HOST = 127.0.0.1
MAX_FILE_DESCRIPTORS=10512
NEGOTIATOR_MAX_TIME_PER_SUBMITTER=600
NEGOTIATOR_MAX_TIME_PER_PIESPIN=500
NEGOTIATOR_TRIM_SHUTDOWN_THRESHOLD=300
NEGOTIATOR_MAX_TIME_PER_CYCLE=1500
CLASSAD_LIFETIME=2200
Submit Machines
- System Configurations:
kernel.pid_max=131072
/proc/sys/net/core/somaxconn set to 1024
- 00_gwms_general.config
CONDOR_HOST=$(FULL_HOSTNAME)
UID_DOMAIN=$(FULL_HOSTNAME)
FILESYSTEM_DOMAIN=$(FULL_HOSTNAME)
LOCK = $(LOG)
DAEMON_LIST = MASTER
SEC_DAEMON_SESSION_DURATION = 50000
SEC_DEFAULT_AUTHENTICATION = REQUIRED
SEC_DEFAULT_AUTHENTICATION_METHODS = FS,GSI
SEC_READ_AUTHENTICATION = OPTIONAL
SEC_CLIENT_AUTHENTICATION = OPTIONAL
DENY_WRITE = anonymous@*
DENY_ADMINISTRATOR = anonymous@*
DENY_DAEMON = anonymous@*
DENY_NEGOTIATOR = anonymous@*
DENY_CLIENT = anonymous@*
SEC_DEFAULT_ENCRYPTION = OPTIONAL
SEC_DEFAULT_INTEGRITY = REQUIRED
SEC_READ_INTEGRITY = OPTIONAL
SEC_CLIENT_INTEGRITY = OPTIONAL
SEC_READ_ENCRYPTION = OPTIONAL
SEC_CLIENT_ENCRYPTION = OPTIONAL
HOSTALLOW_WRITE = *
ALLOW_WRITE = $(HOSTALLOW_WRITE)
- 02_gwms_schedds.config
DAEMON_LIST = $(DAEMON_LIST), SCHEDD
MAX_JOBS_RUNNING = 30000
JOB_START_DELAY = 2
JOB_START_COUNT = 50
JOB_STOP_DELAY = 1
JOB_STOP_COUNT = 30
MAX_CONCURRENT_UPLOADS = 100
MAX_CONCURRENT_DOWNLOADS = 100
APPEND_REQ_VANILLA = (Memory>=1) && (Disk>=1)
JOB_DEFAULT_REQUESTMEMORY=1024
JOB_DEFAULT_REQUESTDISK=1
MAXJOBRETIREMENTTIME = $(HOUR) * 24 * 7
SEC_ENABLE_MATCH_PASSWORD_AUTHENTICATION = TRUE
SCHEDD_SEND_VACATE_VIA_TCP = True
STARTD_SENDS_ALIVES = True
ENABLE_USERLOG_FSYNC = False
SHADOW.GLEXEC_STARTER = True
SHADOW.GLEXEC = /bin/false
MAX_SHADOW_LOG = 100000000
SPOOL_DIR_STRING="$(SPOOL)"
LOCAL_SCHEDD_DIR=/var/lib/condor
SCHEDD_EXPRS = $(SCHEDD_EXPRS) SPOOL_DIR_STRING
RESERVED_SWAP = 0
SHADOW.USE_SHARED_PORT = True
SCHEDD.USE_SHARED_PORT = True
SHARED_PORT_MAX_WORKERS = 1000
SHARED_PORT_ARGS = -p 9615
DAEMON_LIST = $(DAEMON_LIST), SHARED_PORT
JOB_Site = "$$(GLIDEIN_Site:Unknown)"
JOB_GLIDEIN_Entry_Name = "$$(GLIDEIN_Entry_Name:Unknown)"
JOB_GLIDEIN_Name = "$$(GLIDEIN_Name:Unknown)"
JOB_GLIDEIN_Factory = "$$(GLIDEIN_Factory:Unknown)"
JOB_GLIDEIN_Schedd = "$$(GLIDEIN_Schedd:Unknown)"
JOB_GLIDEIN_ClusterId = "$$(GLIDEIN_ClusterId:Unknown)"
JOB_GLIDEIN_ProcId = "$$(GLIDEIN_ProcId:Unknown)"
JOB_GLIDEIN_Site = "$$(GLIDEIN_Site:Unknown)"
JOB_GLIDEIN_SiteWMS = "$$(GLIDEIN_SiteWMS:Unknown)"
JOB_GLIDEIN_SiteWMS_Slot = "$$(GLIDEIN_SiteWMS_Slot:Unknown)"
JOB_GLIDEIN_SiteWMS_JobId = "$$(GLIDEIN_SiteWMS_JobId:Unknown)"
JOB_GLIDEIN_SiteWMS_Queue = "$$(GLIDEIN_SiteWMS_Queue:Unknown)"
JOB_GLIDEIN_ATTRS = JOB_Site JOB_GLIDEIN_Entry_Name JOB_GLIDEIN_Name JOB_GLIDEIN_Factory JOB_GLIDEIN_Schedd JOB_GLIDEIN_ClusterId JOB_GLIDEIN_ProcId JOB_GLIDEIN_Site
JOB_GLIDEIN_SITEWMS_ATTRS = JOB_GLIDEIN_SiteWMS JOB_GLIDEIN_SiteWMS_Slot JOB_GLIDEIN_SiteWMS_JobId JOB_GLIDEIN_SiteWMS_Queue
JobAdInformationAttrs = $(JOB_GLIDEIN_ATTRS), $(JOB_GLIDEIN_SITEWMS_ATTRS)
SUBMIT_EXPRS = $(SUBMIT_EXPRS) $(JOB_GLIDEIN_ATTRS) $(JOB_GLIDEIN_SITEWMS_ATTRS) JobAdInformationAttrs
- 03_gwms_local.config
GSI_DAEMON_TRUSTED_CA_DIR= /etc/grid-security/certificates
GSI_DAEMON_CERT = /etc/grid-security/hostcert.pem
GSI_DAEMON_KEY = /etc/grid-security/hostkey.pem
CERTIFICATE_MAPFILE= /etc/condor/certs/condor_mapfile
- 10_high_availability.config
CONDOR_HOST =
CENTRAL_MANAGER1 = test-012.t2.ucsd.edu
CENTRAL_MANAGER2 = e143.chtc.wisc.edu
COLLECTOR_HOST = $(CENTRAL_MANAGER1),$(CENTRAL_MANAGER2)
HOSTALLOW_ADMINISTRATOR = $(COLLECTOR_HOST)
HOSTALLOW_NEGOTIATOR = $(COLLECTOR_HOST)
- 95_scalability_tweaks.config
MAX_DEFAULT_LOG = 100 Mb
SHADOW_WORKLIFE = 24 * $(HOUR)
SOCKET_LISTEN_BACKLOG = 1024
SHADOW=$(SBIN)/condor_shadow_s
CONDOR_Q_USE_V3_PROTOCOL = False
These are the submit machines:
- e141.chtc.wisc.edu
- e142.chtc.wisc.edu
- test-003.t2.ucsd.edu
- test-010.t2.ucsd.edu
Execute Machines