As described in #4490, CMS needs a wide-area glidein pool with 200,000 cores. We have a test setup that simulates a large pool to see how we can scale to that level. We use only a fraction of the normal hardware for execute nodes by running many startds per core and running sleep jobs that don't consume CPU or memory. This page documents both the HTCondor configuration (i.e. changes to the default RPM) as well as any Linux kernel parameter tuning required, divided into three sections: configs/tunings on the central manager, configs/tunings required on the submit nodes, and configs/tunings on the execute machines. {subsection: StartD} {verbatim} UPDATE_INTERVAL=$RANDOM_INTEGER(540, 740, 1) {endverbatim} {subsection: Central Manager} The central managers run in HA mode on test-012.t2.ucsd.edu and e143.chtc.wisc.edu. They have 128GB of memory and 32 cores. They have collector trees with 200 child collectors. Central Manager Configuration: *: System Configurations: {verbatim} net.core.rmem_max = 10000000 (via sysctl) renice -5 main Collector Process {endverbatim} *: 00_gwms_general.config {verbatim} CONDOR_HOST=$(FULL_HOSTNAME) UID_DOMAIN=$(FULL_HOSTNAME) FILESYSTEM_DOMAIN=$(FULL_HOSTNAME) LOCK = $(LOG) DAEMON_LIST = MASTER SEC_DAEMON_SESSION_DURATION = 50000 SEC_DEFAULT_AUTHENTICATION = REQUIRED SEC_DEFAULT_AUTHENTICATION_METHODS = FS,GSI SEC_READ_AUTHENTICATION = OPTIONAL SEC_CLIENT_AUTHENTICATION = OPTIONAL DENY_WRITE = anonymous@* DENY_ADMINISTRATOR = anonymous@* DENY_DAEMON = anonymous@* DENY_NEGOTIATOR = anonymous@* DENY_CLIENT = anonymous@* SEC_DEFAULT_ENCRYPTION = OPTIONAL SEC_DEFAULT_INTEGRITY = REQUIRED SEC_READ_INTEGRITY = OPTIONAL SEC_CLIENT_INTEGRITY = OPTIONAL SEC_READ_ENCRYPTION = OPTIONAL SEC_CLIENT_ENCRYPTION = OPTIONAL HOSTALLOW_WRITE = * ALLOW_WRITE = $(HOSTALLOW_WRITE) {endverbatim} *: 01_gwms_collectors.config {verbatim} COLLECTOR_NAME = frontend_service COLLECTOR_HOST = $(CONDOR_HOST) COLLECTOR.USE_VOMS_ATTRIBUTES = False 
COLLECTOR_MAX_FILE_DESCRIPTORS=80000 SCHEDD_MAX_FILE_DESCRIPTORS = 80000 SHARED_PORT_MAX_FILE_DESCRIPTORS = 80000 DAEMON_LIST = $(DAEMON_LIST), COLLECTOR, NEGOTIATOR NEGOTIATOR_POST_JOB_RANK = MY.LastHeardFrom NEGOTIATOR_INTERVAL = 60 NEGOTIATOR_MAX_TIME_PER_SUBMITTER=60 NEGOTIATOR_MAX_TIME_PER_PIESPIN=20 PREEMPTION_REQUIREMENTS = False NEGOTIATOR_INFORM_STARTD = False NEGOTIATOR.USE_VOMS_ATTRIBUTES = False NEGOTIATOR_CONSIDER_PREEMPTION = False CONDOR_VIEW_HOST = $(COLLECTOR_HOST) {endverbatim} *: 03_gwms_local.config {verbatim} GSI_DAEMON_TRUSTED_CA_DIR= /etc/grid-security/certificates GSI_DAEMON_CERT = /etc/grid-security/hostcert.pem GSI_DAEMON_KEY = /etc/grid-security/hostkey.pem CERTIFICATE_MAPFILE= /etc/condor/certs/condor_mapfile {endverbatim} *: 10_high_availability.config {verbatim} CONDOR_HOST= CENTRAL_MANAGER1 = test-012.t2.ucsd.edu CENTRAL_MANAGER2 = e143.chtc.wisc.edu COLLECTOR_HOST = $(CENTRAL_MANAGER1),$(CENTRAL_MANAGER2) HAD_PORT = 51450 HAD_ARGS = -p $(HAD_PORT) REPLICATION_PORT = 41450 REPLICATION_ARGS = -p $(REPLICATION_PORT) REPLICATION_LIST = \ $(CENTRAL_MANAGER1):$(REPLICATION_PORT), \ $(CENTRAL_MANAGER2):$(REPLICATION_PORT) HAD_LIST = \ $(CENTRAL_MANAGER1):$(HAD_PORT), \ $(CENTRAL_MANAGER2):$(HAD_PORT) HAD_CONNECTION_TIMEOUT = 5 HAD_USE_PRIMARY = true HAD = $(SBIN)/condor_had REPLICATION = $(SBIN)/condor_replication TRANSFERER = $(SBIN)/condor_transferd DAEMON_LIST = $(DAEMON_LIST), HAD, REPLICATION HAD_USE_REPLICATION = true STATE_FILE = $(SPOOL)/Accountantnew.log REPLICATION_INTERVAL = 300 MAX_TRANSFERER_LIFETIME = 300 HAD_UPDATE_INTERVAL = 300 MASTER_NEGOTIATOR_CONTROLLER = HAD MASTER_HAD_BACKOFF_CONSTANT = 360 MAX_HAD_LOG = 640000 HAD_DEBUG = D_COMMAND HAD_LOG = $(LOG)/HADLog MAX_REPLICATION_LOG = 640000 REPLICATION_DEBUG = D_COMMAND REPLICATION_LOG = $(LOG)/ReplicationLog MAX_TRANSFERER_LOG = 640000 TRANSFERER_DEBUG = D_COMMAND TRANSFERER_LOG = $(LOG)/TransferLog HOSTALLOW_ADMINISTRATOR = $(COLLECTOR_HOST) HOSTALLOW_NEGOTIATOR = 
$(COLLECTOR_HOST) {endverbatim} *: 11_gwms_secondary_collectors.config {verbatim} COLLECTOR9620 = $(COLLECTOR) COLLECTOR9620_ENVIRONMENT = "_CONDOR_COLLECTOR_LOG=$(LOG)/Collector9620Log" COLLECTOR9620_ARGS = -f -p 9620 DAEMON_LIST=$(DAEMON_LIST), COLLECTOR9620 # Similar entry through port 9819 {endverbatim} *: 90_gwms_dns.config {verbatim} # Add certs to GSI_DAEMON_NAME with these subjects: # /DC=com/DC=DigiCert-Grid/O=Open Science Grid/OU=Services/CN=... # test-010.t2.ucsd.edu, test-008.t2.ucsd.edu, test-003.t2.ucsd.edu, # pilot01/test-008.t2.ucsd.edu, gtesterpilot01/test-001.t2.ucsd.edu, # test-009.t2.ucsd.edu, test-004.t2.ucsd.edu, test-005.t2.ucsd.edu, # test-006.t2.ucsd.edu, test-007.t2.ucsd.edu, cmssrv240.fnal.gov, # test-002.t2.ucsd.edu, cmssrv241.fnal.gov {endverbatim} *: 96_scale_tweaks.config {verbatim} COLLECTOR_QUERY_WORKERS=16 CONDOR_VIEW_HOST = 127.0.0.1 MAX_FILE_DESCRIPTORS=10512 NEGOTIATOR_MAX_TIME_PER_SUBMITTER=600 NEGOTIATOR_MAX_TIME_PER_PIESPIN=500 NEGOTIATOR_TRIM_SHUTDOWN_THRESHOLD=300 NEGOTIATOR_MAX_TIME_PER_CYCLE=1500 CLASSAD_LIFETIME=2200 {endverbatim} {subsection:Submit Machines} *: System Configurations: {verbatim} kernel.pid_max=131072 {endverbatim} These are the submit machines: *: e141.chtc.wisc.edu *: e142.chtc.wisc.edu *: test-003.t2.ucsd.edu *: test-010.t2.ucsd.edu {subsection:Execute Machines}