Skip to content

Commit

Permalink
Merge branch 'master' into xuechun-go-yaml
Browse files (browse the repository at this point in the history)
  • Loading branch information
XuechunHou authored Dec 4, 2024
2 parents 6aa4435 + 5849371 commit b8c2863
Show file tree
Hide file tree
Showing 7 changed files with 118 additions and 72 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/stale.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@ jobs:
stale-pr-message: 'This PR was marked stale due to lack of activity. It will be closed in 14 days.'
close-pr-message: 'Closed as inactive. Feel free to reopen if this PR is still being worked on.'
stale-issue-message: 'This issue was marked stale due to lack of activity. It will be closed in 14 days.'
close-issue-message: 'Closed as inacive. Feel free to reopen if this issue is still relevant.'
close-issue-message: 'Closed as inactive. Feel free to reopen if this issue is still relevant.'
days-before-close: 14
8 changes: 4 additions & 4 deletions dev-docs/dev.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,14 +158,14 @@ $ tree $CONFIG_OUT
```

* Sample generated
[golden fluent bit main conf](https://github.com/GoogleCloudPlatform/ops-agent/blob/master/confgenerator/testdata/valid/linux/default_config/golden_fluent_bit_main.conf)
[golden fluent bit main conf](https://github.com/GoogleCloudPlatform/ops-agent/blob/master/confgenerator/testdata/goldens/builtin/golden/linux/fluent_bit_main.conf)
at `$CONFIG_OUT/fluent_bit_main.conf`.
* Sample generated
[golden fluent bit parser conf](https://github.com/GoogleCloudPlatform/ops-agent/blob/master/confgenerator/testdata/valid/linux/default_config/golden_fluent_bit_parser.conf)
[golden fluent bit parser conf](https://github.com/GoogleCloudPlatform/ops-agent/blob/master/confgenerator/testdata/goldens/builtin/golden/linux/fluent_bit_parser.conf)
at `$CONFIG_OUT/fluent_bit_parser.conf`.
* Sample generated
[golden otel conf](https://github.com/GoogleCloudPlatform/ops-agent/blob/master/confgenerator/testdata/valid/linux/default_config/golden_otel.conf)
at `$CONFIG_OUT/otel.conf`.
[golden otel yaml](https://github.com/GoogleCloudPlatform/ops-agent/blob/master/confgenerator/testdata/goldens/builtin/golden/linux/otel.yaml)
at `$CONFIG_OUT/otel.yaml`.

## Build and test manually on GCE VMs

Expand Down
19 changes: 19 additions & 0 deletions integration_test/gce/gce_testing.go
Original file line number Diff line number Diff line change
Expand Up @@ -1101,6 +1101,25 @@ func addFrameworkMetadata(imageSpec string, inputMetadata map[string]string) (ma
if _, ok := metadataCopy["startup-script"]; ok {
return nil, errors.New("the 'startup-script' metadata key is reserved for future use. Instead, wait for the instance to be ready and then run things with RunRemotely() or RunScriptRemotely()")
}
// TODO(b/380470389): we actually *can't* do RunRemotely() on DLVM images due to a bug.
// The workaround for the bug is to deploy a fix in-VM via startup scripts.
if strings.Contains(imageSpec, "common-gpu-debian-11-py310") {
metadataCopy["startup-script"] = fmt.Sprintf(`
#!/bin/bash
# Give time for the guest agent and jupyter stuff to finish modifying
# /etc/passwd and test_user home directory
sleep 120
HOMEDIR=/home/%[1]s
SSHFILE=$HOMEDIR/.ssh/authorized_keys
if [ ! -f "$SSHFILE" ]; then
sudo mkdir -p "$HOMEDIR/.ssh"
sudo touch "$SSHFILE"
fi
sudo chown -R %[1]s:%[1]s "$HOMEDIR"
sudo chmod 600 "$SSHFILE"`,
sshUserName,
)
}
}
return metadataCopy, nil
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
set -e
source /etc/os-release
VERSION_ID=${VERSION_ID%%.*}
MAJOR_VERSION_ID=${VERSION_ID%%.*}

verify_driver() {
# Verify NVIDIA driver:
Expand All @@ -18,24 +18,32 @@ install_cuda_from_runfile() {
# Remove existing installation before using the runfile
remove_cuda_package
remove_driver_package
# For Rocky Linux 9: when a new OS version becomes available, the default
# repo setting (/etc/yum.repos.d/rocky.repo) will automatically point to the
# new version's repo. This is problematic since the new OS is not available
# right away on GCE. Set up the matched repo to install the correct
# kernel-devel-$(uname -r)
# Not needed for RL8 since 8.10 is already the last RL8 release.
if [[ $ID == rocky && "${MAJOR_VERSION_ID}" == 9 ]]; then
cat <<EOF | sudo tee /etc/yum.repos.d/rocky-matched.repo
[appstream-matched]
name=Rocky Linux \$releasever - AppStream - Matched
baseurl=https://dl.rockylinux.org/vault/rocky/$VERSION_ID/AppStream/\$basearch/os/
gpgcheck=1
enabled=1
countme=1
metadata_expire=6h
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
EOF
fi
sudo yum install -y kernel-devel-$(uname -r) pciutils gcc make wget yum-utils

# Installing latest version of NVIDIA CUDA and driver
# Data Center/Tesla drivers and CUDA are released on different schedules;
# normally we install the matching versions of driver and CUDA
# ($DRIVER_VERSION == $CUDA_BUNDLED_DRIVER_VERSION); due to https://github.com/NVIDIA/open-gpu-kernel-modules/issues/550
# we install a newer version of the driver
local DRIVER_VERSION=535.129.03
local CUDA_VERSION=12.2.2
local CUDA_BUNDLED_DRIVER_VERSION=535.104.05
echo "Installing NVIDIA Data Center driver $DRIVER_VERSION"
curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
# Install the CUDA toolkit only, so that the CUDA toolkit uses the Data Center driver installed in the previous step
# See https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/ for CUDA and driver compatibility
local CUDA_VERSION=12.6.3
local CUDA_BUNDLED_DRIVER_VERSION=560.35.05
echo "Installing CUDA Toolkit $CUDA_VERSION from CUDA installer with bundled driver $CUDA_BUNDLED_DRIVER_VERSION"
curl -fSsl -O https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run
sudo sh cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run --toolkit --silent
sudo sh cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run --silent
verify_driver
}

Expand All @@ -44,15 +52,15 @@ setup_repo() {
# Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#prepare-rhel-9-rocky-9
sudo yum install -y yum-utils epel-release
sudo yum-config-manager \
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$VERSION_ID/x86_64/cuda-rhel$VERSION_ID.repo
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$MAJOR_VERSION_ID/x86_64/cuda-rhel$MAJOR_VERSION_ID.repo
sudo yum clean all
}

install_cuda_from_package_manager() {
setup_repo
install_driver_package
# TODO(b/377558109): remove the temporary fix once the repo is updated
sudo yum -y install cuda-toolkit cuda-demo*
sudo yum -y install cuda-toolkit cuda-demo*
verify_driver
}

Expand All @@ -76,7 +84,7 @@ install_dcgm() {
try_install() {
# Export all functions for the bash subprocess
eval "$(declare -F | sed 's/ -f / -fx /')"
export VERSION_ID
export ID MAJOR_VERSION_ID VERSION_ID
for install_method in "$@"; do
echo "Installing NVIDIA driver and CUDA with $install_method..."
# Can't use a subshell because of https://lists.gnu.org/archive/html/bug-bash/2012-12/msg00094.html
Expand Down Expand Up @@ -114,9 +122,10 @@ handle_common() {
# Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#removing-cuda-toolkit-and-driver
sudo yum -y module remove --all nvidia-driver
}

}

case "$VERSION_ID" in
case "$MAJOR_VERSION_ID" in
7) handle_rhel7;;
*) handle_common;;
esac
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
set -e
source /etc/os-release
VERSION_ID=${VERSION_ID%%.*}
MAJOR_VERSION_ID=${VERSION_ID%%.*}

verify_driver() {
# Verify NVIDIA driver:
Expand All @@ -18,24 +18,32 @@ install_cuda_from_runfile() {
# Remove existing installation before using the runfile
remove_cuda_package
remove_driver_package
# For Rocky Linux 9: when a new OS version becomes available, the default
# repo setting (/etc/yum.repos.d/rocky.repo) will automatically point to the
# new version's repo. This is problematic since the new OS is not available
# right away on GCE. Set up the matched repo to install the correct
# kernel-devel-$(uname -r)
# Not needed for RL8 since 8.10 is already the last RL8 release.
if [[ $ID == rocky && "${MAJOR_VERSION_ID}" == 9 ]]; then
cat <<EOF | sudo tee /etc/yum.repos.d/rocky-matched.repo
[appstream-matched]
name=Rocky Linux \$releasever - AppStream - Matched
baseurl=https://dl.rockylinux.org/vault/rocky/$VERSION_ID/AppStream/\$basearch/os/
gpgcheck=1
enabled=1
countme=1
metadata_expire=6h
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
EOF
fi
sudo yum install -y kernel-devel-$(uname -r) pciutils gcc make wget yum-utils

# Installing latest version of NVIDIA CUDA and driver
# Data Center/Tesla drivers and CUDA are released on different schedules;
# normally we install the matching versions of driver and CUDA
# ($DRIVER_VERSION == $CUDA_BUNDLED_DRIVER_VERSION); due to https://github.com/NVIDIA/open-gpu-kernel-modules/issues/550
# we install a newer version of the driver
local DRIVER_VERSION=535.129.03
local CUDA_VERSION=12.2.2
local CUDA_BUNDLED_DRIVER_VERSION=535.104.05
echo "Installing NVIDIA Data Center driver $DRIVER_VERSION"
curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
# Install the CUDA toolkit only, so that the CUDA toolkit uses the Data Center driver installed in the previous step
# See https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/ for CUDA and driver compatibility
local CUDA_VERSION=12.6.3
local CUDA_BUNDLED_DRIVER_VERSION=560.35.05
echo "Installing CUDA Toolkit $CUDA_VERSION from CUDA installer with bundled driver $CUDA_BUNDLED_DRIVER_VERSION"
curl -fSsl -O https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run
sudo sh cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run --toolkit --silent
sudo sh cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run --silent
verify_driver
}

Expand All @@ -44,7 +52,7 @@ setup_repo() {
# Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#prepare-rhel-9-rocky-9
sudo yum install -y yum-utils epel-release
sudo yum-config-manager \
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$VERSION_ID/x86_64/cuda-rhel$VERSION_ID.repo
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$MAJOR_VERSION_ID/x86_64/cuda-rhel$MAJOR_VERSION_ID.repo
sudo yum clean all
}

Expand Down Expand Up @@ -76,7 +84,7 @@ install_dcgm() {
try_install() {
# Export all functions for the bash subprocess
eval "$(declare -F | sed 's/ -f / -fx /')"
export VERSION_ID
export ID MAJOR_VERSION_ID VERSION_ID
for install_method in "$@"; do
echo "Installing NVIDIA driver and CUDA with $install_method..."
# Can't use a subshell because of https://lists.gnu.org/archive/html/bug-bash/2012-12/msg00094.html
Expand Down Expand Up @@ -114,9 +122,10 @@ handle_common() {
# Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#removing-cuda-toolkit-and-driver
sudo yum -y module remove --all nvidia-driver
}

}

case "$VERSION_ID" in
case "$MAJOR_VERSION_ID" in
7) handle_rhel7;;
*) handle_common;;
esac
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
set -e
source /etc/os-release
VERSION_ID=${VERSION_ID%%.*}
MAJOR_VERSION_ID=${VERSION_ID%%.*}

verify_driver() {
# Verify NVIDIA driver:
Expand All @@ -18,24 +18,32 @@ install_cuda_from_runfile() {
# Remove existing installation before using the runfile
remove_cuda_package
remove_driver_package
# For Rocky Linux 9: when a new OS version becomes available, the default
# repo setting (/etc/yum.repos.d/rocky.repo) will automatically point to the
# new version's repo. This is problematic since the new OS is not available
# right away on GCE. Set up the matched repo to install the correct
# kernel-devel-$(uname -r)
# Not needed for RL8 since 8.10 is already the last RL8 release.
if [[ $ID == rocky && "${MAJOR_VERSION_ID}" == 9 ]]; then
cat <<EOF | sudo tee /etc/yum.repos.d/rocky-matched.repo
[appstream-matched]
name=Rocky Linux \$releasever - AppStream - Matched
baseurl=https://dl.rockylinux.org/vault/rocky/$VERSION_ID/AppStream/\$basearch/os/
gpgcheck=1
enabled=1
countme=1
metadata_expire=6h
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
EOF
fi
sudo yum install -y kernel-devel-$(uname -r) pciutils gcc make wget yum-utils

# Installing latest version of NVIDIA CUDA and driver
# Data Center/Tesla drivers and CUDA are released on different schedules;
# normally we install the matching versions of driver and CUDA
# ($DRIVER_VERSION == $CUDA_BUNDLED_DRIVER_VERSION); due to https://github.com/NVIDIA/open-gpu-kernel-modules/issues/550
# we install a newer version of the driver
local DRIVER_VERSION=535.129.03
local CUDA_VERSION=12.2.2
local CUDA_BUNDLED_DRIVER_VERSION=535.104.05
echo "Installing NVIDIA Data Center driver $DRIVER_VERSION"
curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
# Install the CUDA toolkit only, so that the CUDA toolkit uses the Data Center driver installed in the previous step
# See https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/ for CUDA and driver compatibility
local CUDA_VERSION=12.6.3
local CUDA_BUNDLED_DRIVER_VERSION=560.35.05
echo "Installing CUDA Toolkit $CUDA_VERSION from CUDA installer with bundled driver $CUDA_BUNDLED_DRIVER_VERSION"
curl -fSsl -O https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run
sudo sh cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run --toolkit --silent
sudo sh cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run --silent
verify_driver
}

Expand All @@ -44,7 +52,7 @@ setup_repo() {
# Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#prepare-rhel-9-rocky-9
sudo yum install -y yum-utils epel-release
sudo yum-config-manager \
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$VERSION_ID/x86_64/cuda-rhel$VERSION_ID.repo
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$MAJOR_VERSION_ID/x86_64/cuda-rhel$MAJOR_VERSION_ID.repo
sudo yum clean all
}

Expand All @@ -66,7 +74,7 @@ remove_cuda_package() {
try_install() {
# Export all functions for the bash subprocess
eval "$(declare -F | sed 's/ -f / -fx /')"
export VERSION_ID
export ID MAJOR_VERSION_ID VERSION_ID
for install_method in "$@"; do
echo "Installing NVIDIA driver and CUDA with $install_method..."
# Can't use a subshell because of https://lists.gnu.org/archive/html/bug-bash/2012-12/msg00094.html
Expand Down Expand Up @@ -107,7 +115,7 @@ handle_common() {

}

case "$VERSION_ID" in
case "$MAJOR_VERSION_ID" in
7) handle_rhel7;;
*) handle_common;;
esac
Expand Down
31 changes: 16 additions & 15 deletions transformation_test/transformation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -444,9 +444,24 @@ func (transformationConfig transformationTest) runOTelTestInner(t *testing.T, na
}

var errors []any
var exitErr error

// Read from stderr until EOF and put any errors in `errors`.
eg.Go(func() error {
// Wait for the process to exit.
defer eg.Go(func() error {
if err := cmd.Wait(); err != nil {
if _, ok := err.(*exec.ExitError); ok {
exitErr = err
t.Logf("process terminated with error: %v", err)
} else {
return fmt.Errorf("process failed: %w", err)
}
}
cancel()
return nil
})

consumingCount := 0
r := bufio.NewReader(stderr)
d := json.NewDecoder(r)
Expand Down Expand Up @@ -498,6 +513,7 @@ func (transformationConfig transformationTest) runOTelTestInner(t *testing.T, na
}
}
})

// Read and sanitize requests.
eg.Go(func() error {
for r := range requestCh {
Expand All @@ -506,21 +522,6 @@ func (transformationConfig transformationTest) runOTelTestInner(t *testing.T, na
return nil
})

var exitErr error
// Wait for the process to exit.
eg.Go(func() error {
if err := cmd.Wait(); err != nil {
if _, ok := err.(*exec.ExitError); ok {
exitErr = err
t.Logf("process terminated with error: %v", err)
} else {
return fmt.Errorf("process failed: %w", err)
}
}
cancel()
return nil
})

if err := eg.Wait(); err != nil {
t.Errorf("errgroup failed: %v", err)
}
Expand Down

0 comments on commit b8c2863

Please sign in to comment.