From e2bfc7769bbaa605dafd267798c7b1c64857755e Mon Sep 17 00:00:00 2001
From: ZuoZuo <68836346+Mrz-sakura@users.noreply.github.com>
Date: Tue, 7 Apr 2026 19:32:58 +0800
Subject: [PATCH] deploy fix bug

---
 ops-scripts/remote_deploy.sh    | 212 ++++++++++++++++++++++++++++++++
 ops-scripts/tencent_operator.py |  51 ++++++++
 2 files changed, 263 insertions(+)
 create mode 100644 ops-scripts/remote_deploy.sh

diff --git a/ops-scripts/remote_deploy.sh b/ops-scripts/remote_deploy.sh
new file mode 100644
index 0000000..d19d25b
--- /dev/null
+++ b/ops-scripts/remote_deploy.sh
@@ -0,0 +1,212 @@
+#!/usr/bin/env bash
+#
+# Deploy one release of a service on this host: download the package, verify
+# its SHA-256, unpack it into DEPLOY_ROOT/releases/RELEASE_ID, repoint
+# DEPLOY_ROOT/current at it, restart the systemd unit and poll the health URL.
+# If the restart or the health check fails, the previous release (when one
+# exists) is restored and restarted before exiting non-zero.
+
+set -Eeuo pipefail
+
+SERVICE=""
+RELEASE_ID=""
+PACKAGE_URL=""
+SHA256_VALUE=""
+HEALTH_URL=""
+UNIT_NAME=""
+DEPLOY_ROOT=""
+READY_TIMEOUT_SECONDS="${READY_TIMEOUT_SECONDS:-180}"
+DOWNLOAD_TIMEOUT_SECONDS="${DOWNLOAD_TIMEOUT_SECONDS:-600}"
+POLL_INTERVAL_SECONDS="${POLL_INTERVAL_SECONDS:-2}"
+
+# Print the command-line synopsis to stdout.
+usage() {
+  cat <<'EOF'
+Usage:
+  deploy.sh \
+    --service NAME \
+    --release-id ID \
+    --package-url URL \
+    --sha256 HEX_DIGEST \
+    --health-url URL \
+    --unit-name SYSTEMD_UNIT \
+    --deploy-root DIR \
+    [--ready-timeout-seconds N] \
+    [--download-timeout-seconds N] \
+    [--poll-interval-seconds N]
+EOF
+}
+
+# Write a timestamped message to stdout.
+log() {
+  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S %z')" "$*"
+}
+
+# Exit with an error unless the command named by $1 is on PATH.
+require_cmd() {
+  command -v "$1" >/dev/null 2>&1 || {
+    echo "missing command: $1" >&2
+    exit 1
+  }
+}
+
+# Exit with a clear message when option $1 was given without a value; without
+# this, "set -u" aborts on the unbound "$2" with an unhelpful error.
+require_value() {
+  [[ $# -ge 2 ]] || {
+    echo "missing value for $1" >&2
+    usage
+    exit 1
+  }
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --service) require_value "$@"; SERVICE="$2"; shift 2 ;;
+    --release-id) require_value "$@"; RELEASE_ID="$2"; shift 2 ;;
+    --package-url) require_value "$@"; PACKAGE_URL="$2"; shift 2 ;;
+    --sha256) require_value "$@"; SHA256_VALUE="$2"; shift 2 ;;
+    --health-url) require_value "$@"; HEALTH_URL="$2"; shift 2 ;;
+    --unit-name) require_value "$@"; UNIT_NAME="$2"; shift 2 ;;
+    --deploy-root) require_value "$@"; DEPLOY_ROOT="$2"; shift 2 ;;
+    --ready-timeout-seconds) require_value "$@"; READY_TIMEOUT_SECONDS="$2"; shift 2 ;;
+    --download-timeout-seconds) require_value "$@"; DOWNLOAD_TIMEOUT_SECONDS="$2"; shift 2 ;;
+    --poll-interval-seconds) require_value "$@"; POLL_INTERVAL_SECONDS="$2"; shift 2 ;;
+    -h|--help) usage; exit 0 ;;
+    *) echo "unknown arg: $1" >&2; usage; exit 1 ;;
+  esac
+done
+
+[[ -n "$SERVICE" && -n "$RELEASE_ID" && -n "$PACKAGE_URL" && -n "$SHA256_VALUE" && -n "$HEALTH_URL" && -n "$UNIT_NAME" && -n "$DEPLOY_ROOT" ]] || {
+  usage
+  exit 1
+}
+
+require_cmd curl
+require_cmd sha256sum
+require_cmd tar
+require_cmd systemctl
+require_cmd mktemp
+
+release_dir="${DEPLOY_ROOT}/releases/${RELEASE_ID}"
+current_link="${DEPLOY_ROOT}/current"
+# Private, unpredictable scratch dir: a fixed path under /tmp could be
+# pre-created by another local user and is shared by concurrent deploys.
+staging_root="$(mktemp -d "${TMPDIR:-/tmp}/chatapp-deploy.XXXXXX")"
+staging_package="${staging_root}/${SERVICE}.tgz"
+staging_extract="${staging_root}/extract"
+partial_release_dir=""
+previous_target=""
+
+# Remove scratch data (and any unpublished release dir) on every exit path.
+cleanup() {
+  rm -rf "${staging_root}"
+  if [[ -n "${partial_release_dir}" ]]; then
+    rm -rf "${partial_release_dir}"
+  fi
+}
+trap cleanup EXIT
+
+mkdir -p "${DEPLOY_ROOT}/releases" "${staging_extract}"
+if [[ -L "${current_link}" ]]; then
+  previous_target="$(readlink -f "${current_link}" || true)"
+fi
+
+# Point "current" back at the release that was live before this deploy and
+# restart the unit. $1 is the reason, used only for logging.
+rollback() {
+  local rollback_reason="$1"
+  log "rollback: ${rollback_reason}"
+  if [[ -n "${previous_target}" && -d "${previous_target}" ]]; then
+    ln -sfn "${previous_target}" "${current_link}.tmp"
+    mv -Tf "${current_link}.tmp" "${current_link}"
+    /usr/bin/systemctl reset-failed "${UNIT_NAME}" >/dev/null 2>&1 || true
+    /usr/bin/systemctl start "${UNIT_NAME}" || true
+    if ! wait_ready "rollback"; then
+      log "rollback target is still not ready"
+    fi
+  else
+    log "rollback skipped: no previous release to restore"
+  fi
+}
+
+# Poll HEALTH_URL until it answers successfully or READY_TIMEOUT_SECONDS
+# elapse. $1 labels the phase in the log. Returns 0 when ready, 1 on timeout.
+wait_ready() {
+  local phase="$1"
+  local deadline=$(( $(date +%s) + READY_TIMEOUT_SECONDS ))
+
+  while [[ "$(date +%s)" -lt "${deadline}" ]]; do
+    if curl -fsS --max-time 3 "${HEALTH_URL}" >/dev/null; then
+      log "${phase}: ready check passed"
+      return 0
+    fi
+    sleep "${POLL_INTERVAL_SECONDS}"
+  done
+
+  return 1
+}
+
+# Stop the unit if it is currently active.
+stop_service() {
+  if /usr/bin/systemctl is-active --quiet "${UNIT_NAME}"; then
+    log "stopping ${UNIT_NAME} with SIGTERM"
+    # The unit files already set KillSignal=SIGTERM, so stop triggers graceful shutdown.
+    /usr/bin/systemctl stop "${UNIT_NAME}"
+  else
+    log "${UNIT_NAME} already inactive"
+  fi
+}
+
+# Clear any recorded failure state, then start the unit.
+start_service() {
+  log "starting ${UNIT_NAME}"
+  /usr/bin/systemctl reset-failed "${UNIT_NAME}" >/dev/null 2>&1 || true
+  /usr/bin/systemctl start "${UNIT_NAME}"
+}
+
+log "download package ${PACKAGE_URL}"
+curl -fL --connect-timeout 10 --max-time "${DOWNLOAD_TIMEOUT_SECONDS}" -o "${staging_package}" "${PACKAGE_URL}"
+echo "${SHA256_VALUE}  ${staging_package}" | sha256sum -c -
+
+if [[ ! -d "${release_dir}" ]]; then
+  # Unpack into a sibling dir and publish it with one rename. Populating
+  # release_dir in place would let a failed tar/cp leave a partial tree that
+  # the next run finds and reuses as if it were complete.
+  partial_release_dir="${release_dir}.partial.$$"
+  rm -rf "${partial_release_dir}"
+  mkdir -p "${partial_release_dir}"
+  tar -C "${staging_extract}" -xzf "${staging_package}"
+  cp -R "${staging_extract}/." "${partial_release_dir}/"
+  mv -T "${partial_release_dir}" "${release_dir}"
+  partial_release_dir=""
+else
+  log "release dir already exists, reuse ${release_dir}"
+fi
+
+if [[ ! -x "${release_dir}/bin/${SERVICE}" ]]; then
+  echo "release binary missing: ${release_dir}/bin/${SERVICE}" >&2
+  exit 1
+fi
+
+ln -sfn "${release_dir}" "${current_link}.tmp"
+mv -Tf "${current_link}.tmp" "${current_link}"
+
+if ! stop_service; then
+  rollback "systemd stop failed"
+  echo "systemd stop failed: ${UNIT_NAME}" >&2
+  exit 1
+fi
+if ! start_service; then
+  rollback "systemd start failed"
+  echo "systemd start failed: ${UNIT_NAME}" >&2
+  exit 1
+fi
+
+if ! wait_ready "deploy"; then
+  rollback "service failed to become ready after deploy"
+  echo "service failed to become ready: ${UNIT_NAME}" >&2
+  exit 1
+fi
+
+log "deploy success: ${SERVICE}@${RELEASE_ID}"
diff --git a/ops-scripts/tencent_operator.py b/ops-scripts/tencent_operator.py
index 40fd30e..b626bfa 100755
--- a/ops-scripts/tencent_operator.py
+++ b/ops-scripts/tencent_operator.py
@@ -59,6 +59,24 @@ def should_retry_without_output_cos(exc: TencentCloudSDKException) -> bool:
     return code == "ResourceNotFound.RoleNotFound" and "TAT_QCSLinkedRoleInUploadInvocation" in message
 
 
+def local_remote_deploy_script() -> str:
+    """Return the text of remote_deploy.sh, read from beside this module.
+
+    The result always ends with exactly one trailing newline.
+
+    Raises:
+        RuntimeError: if the template file is missing or blank.
+    """
+    template_path = Path(__file__).with_name("remote_deploy.sh")
+    try:
+        content = template_path.read_text(encoding="utf-8")
+    except FileNotFoundError as exc:
+        raise RuntimeError(f"missing remote deploy template: {template_path}") from exc
+    if not content.strip():
+        raise RuntimeError(f"remote deploy template is empty: {template_path}")
+    return content.rstrip() + "\n"
+
+
 class CloudOperator:
     def __init__(self, config: dict[str, Any], service_name: str, instance_id: str, release_id: str | None) -> None:
         self.config = config
@@ -105,6 +123,7 @@ class CloudOperator:
 
     def deploy(self) -> None:
         self.ensure_agent_online()
+        self.ensure_remote_deploy_script_installed()
         self.ensure_systemd_unit_installed()
 
         package_url, sha256 = self.resolve_release_package()
@@ -215,6 +234,38 @@ class CloudOperator:
         task = self.wait_for_tat(invocation_id)
         self.ensure_task_success(task, f"ensure systemd unit {unit_name}")
 
+    def ensure_remote_deploy_script_installed(self) -> None:
+        """Sync remote_deploy.sh to ``tat.script_path`` on the target instance.
+
+        The script is shipped base64-encoded inside a TAT command, and the
+        remote copy is rewritten only when it is missing or differs.
+        """
+        script_path = str(self.config["tat"]["script_path"])
+        script_dir = str(Path(script_path).parent)
+        script_content_b64 = base64.b64encode(local_remote_deploy_script().encode("utf-8")).decode("ascii")
+        command = textwrap.dedent(
+            f"""\
+            set -Eeuo pipefail
+            script_path={shlex.quote(script_path)}
+            script_dir={shlex.quote(script_dir)}
+            tmp_file="$(mktemp)"
+            trap 'rm -f "$tmp_file"' EXIT
+            mkdir -p "$script_dir"
+            TMP_FILE="$tmp_file" python3 - <<'PY'
+            import base64
+            import os
+            from pathlib import Path
+            Path(os.environ["TMP_FILE"]).write_bytes(base64.b64decode("{script_content_b64}"))
+            PY
+            if [[ ! -f "$script_path" ]] || ! cmp -s "$tmp_file" "$script_path"; then
+              install -m 0755 "$tmp_file" "$script_path"
+            fi
+            """
+        )
+        invocation_id = self.run_tat_command(command, f"ensure-deploy-script-{self.service_name}")
+        task = self.wait_for_tat(invocation_id)
+        self.ensure_task_success(task, f"ensure deploy script {script_path}")
+
     def systemd_unit_content(self) -> str:
         deploy_root = str(self.service_cfg["deploy_root"]).rstrip("/")
         binary_path = f"{deploy_root}/current/bin/{self.service_name}"