LVM fixes #714

Changes from all commits

@@ -19,13 +19,16 @@ import (

	"github.com/prometheus/common/expfmt"
	v1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	apierrs "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/kubernetes/test/e2e/framework"

	api "github.com/intel/pmem-csi/pkg/apis/pmemcsi/v1alpha1"

	"github.com/onsi/ginkgo"
	"github.com/onsi/gomega"
)

const (

@@ -181,6 +184,29 @@ func WaitForPMEMDriver(c *Cluster, name, namespace string) (metricsURL string) {
	}
}

// CheckPMEMDriver does some sanity checks for a running deployment.
func CheckPMEMDriver(c *Cluster, deployment *Deployment) {
	pods, err := c.cs.CoreV1().Pods(deployment.Namespace).List(context.Background(),
		metav1.ListOptions{
			LabelSelector: fmt.Sprintf("%s in (%s)", deploymentLabel, deployment.Name),
		},
	)
	framework.ExpectNoError(err, "list PMEM-CSI pods")
	gomega.Expect(len(pods.Items)).Should(gomega.BeNumerically(">", 0), "should have PMEM-CSI pods")
	for _, pod := range pods.Items {
		for _, containerStatus := range pod.Status.ContainerStatuses {
			if containerStatus.RestartCount > 0 {

Review thread on this check:

Maybe we should relax this condition to limit to only our driver container. Test failures also showed that

Or we fix this problem. If it's because of a timeout, then we may be able to increase that timeout.

I think it's much simpler: that happened during the reboot tests, so of course all containers were restarted... I'm not sure yet how to deal with this. The check is useful because it highlights a problem that went unnoticed without it for a while. Perhaps clean up after the reboot tests by killing all pods?

In this case, checking if the pod container restart count == node reboot count might work.

I've implemented the "delete pods" approach. Checking for node reboots is fragile and more complex. Is there even a counter for node restarts?

@avalluri okay now? Tests have passed.

Yes, we can merge.

				framework.Failf("container %q in pod %q restarted %d times, last state: %+v",
					containerStatus.Name,
					pod.Name,
					containerStatus.RestartCount,
					containerStatus.LastTerminationState,
				)
			}
		}
	}
}

// RemoveObjects deletes everything that might have been created for a
// PMEM-CSI driver or operator installation (pods, daemonsets,
// statefulsets, driver info, storage classes, etc.).

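The first suggestion in the thread above would narrow the check to the driver container only. A minimal sketch of that variant, assuming it sits in the same package as the code above and that the driver container is named "pmem-driver" (that name is an assumption, not taken from this diff):

// checkDriverContainerRestarts is a sketch only (not part of this PR): the
// narrower variant discussed in the review thread, which fails only when the
// driver container itself restarted and ignores sidecar containers.
func checkDriverContainerRestarts(c *Cluster, deployment *Deployment) {
	pods, err := c.cs.CoreV1().Pods(deployment.Namespace).List(context.Background(),
		metav1.ListOptions{
			LabelSelector: fmt.Sprintf("%s in (%s)", deploymentLabel, deployment.Name),
		},
	)
	framework.ExpectNoError(err, "list PMEM-CSI pods")
	for _, pod := range pods.Items {
		for _, containerStatus := range pod.Status.ContainerStatuses {
			if containerStatus.Name != "pmem-driver" {
				continue // ignore restarts of sidecar containers
			}
			if containerStatus.RestartCount > 0 {
				framework.Failf("driver container in pod %q restarted %d times, last state: %+v",
					pod.Name, containerStatus.RestartCount, containerStatus.LastTerminationState)
			}
		}
	}
}
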
@@ -559,6 +585,9 @@ func Parse(deploymentName string) (*Deployment, error) {
// a test runs, the desired deployment exists. Deployed drivers are intentionally
// kept running to speed up the execution of multiple tests that all want the
// same kind of deployment.
//
// The driver should never restart. A restart would indicate some
// (potentially intermittent) issue.
func EnsureDeployment(deploymentName string) *Deployment {
	deployment, err := Parse(deploymentName)
	if err != nil {

@@ -588,6 +617,7 @@ func EnsureDeployment(deploymentName string) *Deployment {
	// Do some sanity checks on the running deployment before the test.
	if deployment.HasDriver {
		WaitForPMEMDriver(c, "pmem-csi", deployment.Namespace)
		CheckPMEMDriver(c, deployment)
	}
	if deployment.HasOperator {
		WaitForOperator(c, deployment.Namespace)

@@ -636,6 +666,7 @@ func EnsureDeployment(deploymentName string) *Deployment {
		// looking at the driver state. Long-term we want the operator to do that
		// checking itself.
		WaitForPMEMDriver(c, "pmem-csi", deployment.Namespace)
		CheckPMEMDriver(c, deployment)
	}

	for _, h := range installHooks {

@@ -695,6 +726,31 @@ func (d *Deployment) GetDriverDeployment() api.Deployment {
	}
}

// DeleteAllPods deletes all currently running pods that belong to the deployment.
func (d Deployment) DeleteAllPods(c *Cluster) error {
	listOptions := metav1.ListOptions{
		LabelSelector: fmt.Sprintf("%s in (%s)", deploymentLabel, d.Name),
	}
	pods, err := c.cs.CoreV1().Pods(d.Namespace).List(context.Background(), listOptions)
	if err != nil {
		return fmt.Errorf("list all PMEM-CSI pods: %v", err)
	}
	// Kick off deletion of several pods at once.
	if err := c.cs.CoreV1().Pods(d.Namespace).DeleteCollection(context.Background(),
		metav1.DeleteOptions{},
		listOptions,
	); err != nil {
		return fmt.Errorf("delete all PMEM-CSI pods: %v", err)
	}
	// But still wait for every single one to be gone...
	for _, pod := range pods.Items {
		if err := waitForPodDeletion(c, pod); err != nil {
			return fmt.Errorf("wait for pod deletion: %v", err)
		}
	}
	return nil
}

// DescribeForAll registers tests like ginkgo.Describe does, except that
// each test will then be invoked for each supported PMEM-CSI deployment
// which has a functional PMEM-CSI driver.

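The review thread earlier in this diff settled on recreating the pods after disruptive tests so that restart counters start at zero again. One way such a cleanup could use DeleteAllPods, sketched under the assumption that it sits in the same e2e package; the helper name resetDeployment is invented for illustration and not part of this PR:

// resetDeployment is a sketch only: after a test that reboots nodes (and
// therefore restarts all containers), recreate the PMEM-CSI pods so that
// the restart check in CheckPMEMDriver starts from freshly created pods.
func resetDeployment(c *Cluster, deployment *Deployment) {
	framework.ExpectNoError(deployment.DeleteAllPods(c), "delete PMEM-CSI pods")
	// Wait until the new pods report a working driver, then verify that
	// none of their containers has restarted.
	WaitForPMEMDriver(c, "pmem-csi", deployment.Namespace)
	CheckPMEMDriver(c, deployment)
}
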
@@ -781,3 +837,20 @@ func DefineTests() {
		}
	}
}

// waitForPodDeletion returns an error if it takes too long for the pod to fully terminate.
func waitForPodDeletion(c *Cluster, pod v1.Pod) error {
	return wait.PollImmediate(2*time.Second, time.Minute, func() (bool, error) {
		existingPod, err := c.cs.CoreV1().Pods(pod.Namespace).Get(context.Background(), pod.Name, metav1.GetOptions{})
		if apierrors.IsNotFound(err) {
			return true, nil // done
		}
		if err != nil {
			return true, err // stop wait with error
		}
		if pod.UID != existingPod.UID {
			return true, nil // also done (pod was restarted)
		}
		return false, nil
	})
}

Review thread on this change:

I am surprised how this state of the code ever worked, as it always returns an error. I am quite sure I wrote that if statement there when this code was developed. But it seems that between commits 5365a34 and 0748a91 the whole function was moved, leaving those (quite important) if-lines behind. Just curious, what was the reasoning for leaving out the if-part, and how did the testing pass then? We don't have tests for that part?

That means the current fix is not new code, but actually restores code that was there originally.

You are right @okartau, it was unintentionally introduced by me while moving the code as part of removing the init-container from the driver.

We do not have tests to cover this part of the code. Current tests cover only create, delete, and list devices on a pre-populated logical volume group.

We do go through this code once when installing a driver in LVM mode for the first time. The error return then causes a restart of the container. The next instance of the container then gets past this error because it uses the then-existing namespace instead of entering this faulty branch again. The new check for "no restarts" failed when this line wasn't fixed and passed once it was.
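
For readers without the full diff, the failure mode described in this thread is a plain Go pattern: an error check that was lost when the code moved, so the branch returns an error even on success. A hypothetical sketch of the broken and restored shapes (package, names, and helper are invented and not the actual pmem-csi LVM code):

package lvmsketch // hypothetical package, for illustration only

import "fmt"

// createVolumeGroup stands in for the real setup step; purely hypothetical.
var createVolumeGroup = func() error { return nil }

// Broken shape: the "if" guard was lost when the function was moved, so the
// branch fails even when the setup step succeeded, and the container restarts.
func setupBroken() error {
	err := createVolumeGroup()
	return fmt.Errorf("create volume group: %v", err) // always returns an error
}

// Restored shape: only an actual error aborts; on success execution continues.
func setupFixed() error {
	if err := createVolumeGroup(); err != nil {
		return fmt.Errorf("create volume group: %v", err)
	}
	return nil
}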