Skip to content
Snippets Groups Projects
Commit 24f0c4ce authored by Sergey Yakubov's avatar Sergey Yakubov
Browse files

Merge pull request #58 in ASAPO/asapo from feature_virtualized-deployment to develop

* commit '31324b4e':
  use infiniband ip for receiver data server
  fix network error code
  advertise ip for receiver, update config files
  fix thread unsafe calls in io
  start fluentd after elasticsearch
  change fluentd config
  increase timeout for get_next, exit on any error, update deploy scripts
  add user vars file, variables for n of receivers and brokers
  increase timout for tests
  fix
  flag to use ip over ib, flag to set lightweight service nodes
  more work on deployment
  started deploying on Maxwell
parents 830af518 31324b4e
No related branches found
No related tags found
No related merge requests found
Showing
with 230 additions and 24 deletions
......@@ -51,6 +51,7 @@ auto const kBadFileNumber = IOErrorTemplate {
// Error template for transient failures where a retry may succeed
// (generated for errno EAGAIN, see GetLastErrorFromErrno).
auto const kResourceTemporarilyUnavailable = IOErrorTemplate {
"Resource temporarily unavailable", IOErrorType::kResourceTemporarilyUnavailable
};
// Error template for access-rights failures.
// NOTE(review): presumably mapped from errno EACCES/EPERM - the mapping site is
// not visible in this chunk, confirm against GetLastErrorFromErrno.
auto const kPermissionDenied = IOErrorTemplate {
"Permission denied", IOErrorType::kPermissionDenied
};
......
......@@ -236,14 +236,15 @@ Error MongoDBClient::InsertAsSubset(const FileInfo& file,
if (err) {
return err;
}
auto query = BCON_NEW ("$and","[","{","_id", BCON_INT64(subset_id),"}","{","images._id","{","$ne",BCON_INT64(file.id),"}","}","]");
auto query = BCON_NEW ("$and", "[", "{", "_id", BCON_INT64(subset_id), "}", "{", "images._id", "{", "$ne",
BCON_INT64(file.id), "}", "}", "]");
auto update = BCON_NEW ("$setOnInsert", "{",
"size", BCON_INT64 (subset_size),
"}",
"$addToSet", "{",
"images", BCON_DOCUMENT(document.get()), "}");
err = AddBsonDocumentToArray(query, update,ignore_duplicates);
err = AddBsonDocumentToArray(query, update, ignore_duplicates);
bson_destroy (query);
bson_destroy (update);
......
#include <request/request_pool.h>
#include <request/request_pool.h>
#include "request/request_pool.h"
namespace asapo {
......
......@@ -290,10 +290,13 @@ std::unique_ptr<sockaddr_in> SystemIO::BuildSockaddrIn(const std::string& addres
std::string host;
uint16_t port = 0;
std::tie(host, port) = *hostname_port_tuple;
host = ResolveHostnameToIp(host, err);
if (*err != nullptr) {
return nullptr;
}
// NOTE: ResolveHostnameToIp is not a thread-safe call and is unnecessary here -
// 'address' already contains the resolved IP.
// todo: remove this
// host = ResolveHostnameToIp(host, err);
// if (*err != nullptr) {
// return nullptr;
// }
short family = AddressFamilyToPosixFamily(AddressFamilies::INET);
if (family == -1) {
......
......@@ -35,6 +35,8 @@ Error GetLastErrorFromErrno() {
return IOErrorTemplates::kBadFileNumber.Generate();
case EAGAIN:
return IOErrorTemplates::kResourceTemporarilyUnavailable.Generate();
case ENETUNREACH:
return IOErrorTemplates::kAddressNotValid.Generate();
case ENOENT:
case ENOTDIR:
return IOErrorTemplates::kFileNotFound.Generate();
......
......@@ -16,7 +16,7 @@ RUN add-apt-repository \
$(lsb_release -cs) \
stable"
RUN apt-get update && apt-get install -y docker-ce-cli wget unzip iproute2
RUN apt-get update && apt-get install -y docker-ce-cli wget unzip iproute2 vim
ENV CONSUL_VERSION=1.6.0
......@@ -50,6 +50,8 @@ RUN cd /var/run/asapo asapo && terraform init
COPY asapo-* /usr/bin/
COPY *.sh asapo_overwrite_vars.tfvars /tmp/asapo_runscripts/
COPY *.py /etc/asapo/
COPY *.hcl.tpl /etc/asapo/
......
......@@ -6,4 +6,8 @@ if [ ! -f /var/nomad/token ]; then
cp /var/nomad/token $TF_VAR_service_dir/nomad_token
fi
cd /var/run/asapo && terraform apply -auto-approve "$@"
\ No newline at end of file
if [ -f /var/run/asapo/user_vars.tfvars ]; then
USER_VAR_FILE="-var-file=/var/run/asapo/user_vars.tfvars"
fi
cd /var/run/asapo && terraform apply -auto-approve $USER_VAR_FILE "$@"
\ No newline at end of file
# User-overridable Terraform variables for the ASAPO deployment; passed to
# `terraform apply` via -var-file (mounted as /var/run/asapo/user_vars.tfvars).
elk_logs = true
# Per-service memory limits. Units presumably MB (the cache size below is
# explicitly in GB) - confirm against the Nomad job templates.
receiver_total_memory_size = 35000
receiver_dataserver_cache_size = 30 #gb
grafana_total_memory_size = 2000
influxdb_total_memory_size = 2000
fluentd_total_memory_size = 1000
elasticsearch_total_memory_size = 3000
kibana_total_memory_size = 1000
mongo_total_memory_size = 20000
authorizer_total_memory_size = 512
discovery_total_memory_size = 512
# Instance counts for the scalable services (substituted into the job specs
# as ${n_receivers}/${n_brokers}).
n_receivers = 1
n_brokers = 1
\ No newline at end of file
......@@ -22,8 +22,8 @@ node_meta = {
ib_address = "$ib_address"
}
server = $is_server
bootstrap_expect = $n_servers
server = $is_server
$bootstrap_expect_string
rejoin_after_leave = true
retry_join = $server_adresses
......
#!/usr/bin/env bash
# SLURM batch wrapper for the ASAPO cluster deployment on Maxwell.
#SBATCH --nodes=1
#SBATCH -t 00:40:00
# One task per allocated node so every node starts its own copy of run_maxwell.sh.
srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 ./run_maxwell.sh
......@@ -9,8 +9,8 @@ acl {
}
server {
enabled = $is_server
bootstrap_expect = $n_servers
enabled = $is_server
$bootstrap_expect_string
}
data_dir = "/var/nomad"
......@@ -18,7 +18,24 @@ data_dir = "/var/nomad"
client {
enabled = true
alloc_dir="$nomad_alloc_dir"
meta {
"asapo_service" = $is_asapo_lightweight_service_node
"ib_address" = "$ib_address"
}
}
plugin "docker" {
config {
endpoint = "$docker_endpoint"
tls {
cert = "/etc/nomad/cert.pem"
key = "/etc/nomad/key.pem"
ca = "/etc/nomad/ca.pem"
}
allow_privileged = true
}
}
......@@ -5,9 +5,9 @@ import socket
import json
import os
def is_server(ip,server_names):
def in_server_list(ip,server_names, check_single=False):
servers = json.loads(server_names)
if len(servers) == 1:
if len(servers) == 1 and check_single == False:
return "true"
for server in json.loads(server_names):
try:
......@@ -33,13 +33,21 @@ def set_parameters():
except:
print ("cannot define own ip")
my_ip = "127.0.0.1"
d['docker_endpoint']=my_get_env('DOCKER_ENDPOINT',"unix:///var/run/docker.sock")
d['advertise_ip']=my_get_env('ADVERTISE_IP',my_ip)
d['n_servers']=my_get_env('N_SERVERS',1)
d['server_adresses']=my_get_env('SERVER_ADRESSES','["'+socket.gethostname()+'"]')
d['is_server']=is_server(d['advertise_ip'],d['server_adresses'])
d['is_server']=in_server_list(d['advertise_ip'],d['server_adresses'])
if d['is_server']=="true":
d['bootstrap_expect_string'] = "bootstrap_expect = "+ str(d['n_servers'])
else:
d['bootstrap_expect_string'] = ""
d['ib_address']=my_get_env('IB_ADDRESS',"none")
d['nomad_alloc_dir']=my_get_env('NOMAD_ALLOC_DIR','')
d['recursors']=my_get_env('RECURSORS','["8.8.8.8"]')
lightweight_service_nodes=my_get_env('ASAPO_LIGHTWEIGHT_SERVICE_NODES','[]')
d['is_asapo_lightweight_service_node']=in_server_list(d['advertise_ip'],lightweight_service_nodes, True)
return d
def process_file(file_in,file_out):
......
#!/usr/bin/env bash
# Deploys the dockerized ASAPO cluster on a Maxwell/SLURM node.
# Intended to run once per node (launched via srun from the sbatch wrapper).

#folders
NOMAD_ALLOC_HOST_SHARED=/tmp/asapo/container_host_shared/nomad_alloc
SERVICE_DATA_CLUSTER_SHARED=/home/yakubov/asapo/asapo_cluster_shared/service_data
DATA_GLOBAL_SHARED=/home/yakubov/asapo/global_shared/data

#service distribution
MAX_NOMAD_SERVERS=3 # rest are clients
N_ASAPO_LIGHTWEIGHT_SERVICE_NODES=1 # where to put influx, elk, ... . Rest are receivers, brokers, mongodb

#DESY stuff
# DNS recursors, as a JSON-style list of quoted IPs. Escaping made consistent
# for both entries (the previous second entry was escaped differently but
# produced the same value).
RECURSORS=["\"131.169.40.200\"","\"131.169.194.200\""]

ASAPO_USER=`id -u`:`id -g`

ASAPO_VAR_FILE=`pwd`/asapo_overwrite_vars.tfvars

# use ib interface for service discovery (all communications goes through this interface)
# todo: use ib only for communications with receiver (asapo discovery service should return correct ip using node meta IB_ADDRESS)
USE_IP_OVER_IB=true

#docker stuff
DOCKER_ENDPOINT="127.0.0.1:2376" #comment to use unix sockets
DOCKER_TLS_CA=/data/netapp/docker/certs/ca.pem
DOCKER_TLS_KEY=/data/netapp/docker/certs/$USER/key.pem
DOCKER_TLS_CERT=/data/netapp/docker/certs/$USER/cert.pem

# Resolve this node's infiniband address via its "<host>-ib" DNS entry.
IB_HOSTNAME=`hostname --short`-ib
IB_ADDRESS=`getent hosts $IB_HOSTNAME | awk '{ print $1 }'`

if [ "$USE_IP_OVER_IB" == "true" ]; then
ADVERTISE_IP=$IB_ADDRESS
HOSTNAME_SUFFIX=-ib
fi

#prepare env variables based on the above input
# Cap the number of nomad servers at MAX_NOMAD_SERVERS; remaining nodes are clients.
N_SERVERS=$(( $SLURM_JOB_NUM_NODES > $MAX_NOMAD_SERVERS ? $MAX_NOMAD_SERVERS : $SLURM_JOB_NUM_NODES ))
SERVER_ADRESSES=`scontrol show hostnames $SLURM_JOB_NODELIST | head -$N_SERVERS | awk -v suf=$HOSTNAME_SUFFIX 'BEGIN{printf "["} {printf "%s\"%s%s\"",sep,$0,suf; sep=","} END{print "]"}'`
ASAPO_LIGHTWEIGHT_SERVICE_NODES=`scontrol show hostnames $SLURM_JOB_NODELIST | head -$N_ASAPO_LIGHTWEIGHT_SERVICE_NODES | awk -v suf=$HOSTNAME_SUFFIX 'BEGIN{printf "["} {printf "%s\"%s%s\"",sep,$0,suf; sep=","} END{print "]"}'`

# make folders if not exist
mkdir -p $NOMAD_ALLOC_HOST_SHARED $SERVICE_DATA_CLUSTER_SHARED $DATA_GLOBAL_SHARED
chmod 777 $NOMAD_ALLOC_HOST_SHARED $SERVICE_DATA_CLUSTER_SHARED $DATA_GLOBAL_SHARED
cd $SERVICE_DATA_CLUSTER_SHARED
# -p so a rerun on the same node does not fail on already-existing folders
mkdir -p esdatadir fluentd grafana influxdb mongodb
chmod 777 *

#todo: elastic search check
mmc=`cat /proc/sys/vm/max_map_count`
if (( mmc < 262144 )); then
echo consider increasing max_map_count - needed for elasticsearch
# exit 1
fi

# Remove any leftover container from a previous run, then get the latest image.
docker rm -f asapo
docker pull yakser/asapo-cluster

# Mount the local override var-file only if it exists (MOUNT_VAR_FILE stays
# empty otherwise and expands to nothing below - intentionally unquoted).
if [ -f $ASAPO_VAR_FILE ]; then
MOUNT_VAR_FILE="-v $ASAPO_VAR_FILE:/var/run/asapo/user_vars.tfvars"
fi

# fix: was "dockerrun" (missing space), which is not a valid command
docker run --rm \
-u $ASAPO_USER \
-v /scratch/docker/100000.100000:/scratch/docker/100000.100000 \
-v $NOMAD_ALLOC_HOST_SHARED:$NOMAD_ALLOC_HOST_SHARED \
-v $SERVICE_DATA_CLUSTER_SHARED:$SERVICE_DATA_CLUSTER_SHARED \
-v $DOCKER_TLS_CA:/etc/nomad/ca.pem \
-v $DOCKER_TLS_KEY:/etc/nomad/key.pem \
-v $DOCKER_TLS_CERT:/etc/nomad/cert.pem \
-v $DATA_GLOBAL_SHARED:$DATA_GLOBAL_SHARED \
$MOUNT_VAR_FILE \
-e NOMAD_ALLOC_DIR=$NOMAD_ALLOC_HOST_SHARED \
-e TF_VAR_service_dir=$SERVICE_DATA_CLUSTER_SHARED \
-e TF_VAR_data_dir=$DATA_GLOBAL_SHARED \
-e ADVERTISE_IP=$ADVERTISE_IP \
-e RECURSORS=$RECURSORS \
-e TF_VAR_asapo_user=$ASAPO_USER \
-e IB_ADDRESS=$IB_ADDRESS \
-e SERVER_ADRESSES=$SERVER_ADRESSES \
-e ASAPO_LIGHTWEIGHT_SERVICE_NODES=$ASAPO_LIGHTWEIGHT_SERVICE_NODES \
-e DOCKER_ENDPOINT=$DOCKER_ENDPOINT \
-e N_SERVERS=$N_SERVERS \
--name asapo yakser/asapo-cluster
job "asapo-brokers" {
datacenters = ["dc1"]
affinity {
attribute = "$${meta.asapo_service}"
value = "false"
weight = 100
}
update {
max_parallel = 1
......@@ -9,8 +14,7 @@ job "asapo-brokers" {
}
group "brokers" {
count = 1
count = ${n_brokers}
restart {
attempts = 2
interval = "3m"
......@@ -23,6 +27,9 @@ job "asapo-brokers" {
user = "${asapo_user}"
config {
network_mode = "host"
privileged = true
security_opt = ["no-new-privileges"]
userns_mode = "host"
image = "yakser/asapo-broker${image_suffix}"
force_pull = true
volumes = ["local/config.json:/var/lib/broker/config.json"]
......
job "asapo-logging" {
datacenters = ["dc1"]
affinity {
attribute = "$${meta.asapo_service}"
value = "true"
weight = 100
}
# update {
# max_parallel = 1
......@@ -27,6 +33,9 @@ job "asapo-logging" {
}
config {
network_mode = "host"
privileged = true
security_opt = ["no-new-privileges"]
userns_mode = "host"
image = "yakser/fluentd_elastic"
volumes = ["local/fluentd.conf:/fluentd/etc/fluent.conf",
"/${service_dir}/fluentd:/shared"]
......@@ -95,6 +104,9 @@ job "asapo-logging" {
nproc = "8192"
}
network_mode = "host"
privileged = true
security_opt = ["no-new-privileges"]
userns_mode = "host"
image = "yakser/elasticsearch:${elasticsearch_version}"
volumes = ["/${service_dir}/esdatadir:/usr/share/elasticsearch/data"]
}
......@@ -131,6 +143,9 @@ job "asapo-logging" {
user = "${asapo_user}"
config {
network_mode = "host"
privileged = true
security_opt = ["no-new-privileges"]
userns_mode = "host"
image = "yakser/kibana:${kibana_version}"
volumes = ["local/kibana.yml:/usr/share/kibana/config/kibana.yml"]
}
......
job "asapo-mongo" {
datacenters = ["dc1"]
affinity {
attribute = "$${meta.asapo_service}"
value = "false"
weight = 100
}
update {
max_parallel = 1
min_healthy_time = "10s"
......@@ -24,6 +28,9 @@ job "asapo-mongo" {
config {
network_mode = "host"
privileged = true
security_opt = ["no-new-privileges"]
userns_mode = "host"
image = "mongo:${mongo_version}"
volumes = ["/${service_dir}/mongodb:/data/db"]
}
......
......@@ -38,6 +38,9 @@ job "asapo-nginx" {
config {
network_mode = "host"
privileged = true
security_opt = ["no-new-privileges"]
userns_mode = "host"
image = "nginx:${nginx_version}"
volumes = [
"local/nginx.conf:/etc/nginx/nginx.conf"
......
job "asapo-perfmetrics" {
datacenters = ["dc1"]
affinity {
attribute = "$${meta.asapo_service}"
value = "true"
weight = 100
}
# update {
# max_parallel = 1
......@@ -22,6 +27,9 @@ job "asapo-perfmetrics" {
user = "${asapo_user}"
config {
network_mode = "host"
privileged = true
security_opt = ["no-new-privileges"]
userns_mode = "host"
image = "influxdb:${influxdb_version}"
volumes = ["/${service_dir}/influxdb:/var/lib/influxdb"]
}
......@@ -69,6 +77,9 @@ job "asapo-perfmetrics" {
config {
network_mode = "host"
privileged = true
security_opt = ["no-new-privileges"]
userns_mode = "host"
image = "grafana/grafana:${grafana_version}"
volumes = ["/${service_dir}/grafana:/var/lib/grafana"]
}
......
job "asapo-receivers" {
datacenters = ["dc1"]
affinity {
attribute = "$${meta.asapo_service}"
value = "false"
weight = 100
}
update {
max_parallel = 1
......@@ -9,7 +14,7 @@ job "asapo-receivers" {
}
group "receivers" {
count = 1
count = ${n_receivers}
restart {
attempts = 2
......@@ -23,7 +28,9 @@ job "asapo-receivers" {
user = "${asapo_user}"
config {
network_mode = "host"
dns_servers = ["127.0.0.1"]
privileged = true
security_opt = ["no-new-privileges"]
userns_mode = "host"
image = "yakser/asapo-receiver${image_suffix}"
force_pull = true
volumes = ["local/config.json:/var/lib/receiver/config.json",
......
job "asapo-services" {
datacenters = ["dc1"]
affinity {
attribute = "$${meta.asapo_service}"
value = "true"
weight = 100
}
type = "service"
......@@ -11,7 +16,9 @@ job "asapo-services" {
user = "${asapo_user}"
config {
network_mode = "host"
dns_servers = ["127.0.0.1"]
privileged = true
security_opt = ["no-new-privileges"]
userns_mode = "host"
image = "yakser/asapo-authorizer${image_suffix}"
force_pull = true
volumes = ["local/config.json:/var/lib/authorizer/config.json"]
......@@ -74,7 +81,9 @@ job "asapo-services" {
user = "${asapo_user}"
config {
network_mode = "host"
dns_servers = ["127.0.0.1"]
privileged = true
security_opt = ["no-new-privileges"]
userns_mode = "host"
image = "yakser/asapo-discovery${image_suffix}"
force_pull = true
volumes = ["local/config.json:/var/lib/discovery/config.json"]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment