Troubleshooting and Debugging
Pod Troubleshooting
# Attach an ephemeral debug container to a running pod
# (--target shares the named container's process namespace)
kubectl debug POD_NAME -it \
--image=busybox \
--target=CONTAINER_NAME
# Create a throwaway debugging pod with network tooling
# (--rm deletes the pod when the shell exits)
kubectl run debug-pod \
--image=nicolaka/netshoot \
--rm -it -- /bin/bash
# Check events related to a specific pod
kubectl get events --field-selector involvedObject.name=POD_NAME
# Get pod details (full spec/status as YAML, then human-readable summary)
kubectl get pod POD_NAME -o yaml
kubectl describe pod POD_NAME
# Check logs from the previous container instance (useful after a crash/restart)
kubectl logs POD_NAME --previous
kubectl logs POD_NAME -c CONTAINER_NAME --previous
# Port forward for debugging (local 8080 -> pod port 80)
kubectl port-forward pod/POD_NAME 8080:80
Cluster Diagnostics
# Check cluster health
# NOTE: componentstatuses is deprecated since Kubernetes v1.19; prefer
# querying /healthz or /readyz on control-plane components directly.
kubectl get componentstatuses
# Show each node's Ready condition as a compact two-column table
kubectl get nodes -o custom-columns=NAME:.metadata.name,STATUS:.status.conditions[?(@.type=="Ready")].status
# Check system pods
kubectl get pods -n kube-system
# Resource usage (requires metrics-server to be installed)
kubectl top nodes
kubectl top pods --all-namespaces
# API server availability
kubectl get --raw /healthz
kubectl get --raw /metrics
# Check cluster events, oldest first
kubectl get events --sort-by=.metadata.creationTimestamp
# Audit pod placement: list pods that are neither Running nor Completed
kubectl get pods -o wide --all-namespaces | \
grep -v Running | \
grep -v Completed
Network Diagnostics
# Test network connectivity
# One-shot busybox pod resolving the in-cluster API service name.
# Pin busybox:1.28 — nslookup is known to be broken in later busybox builds.
kubectl run test-dns \
--image=busybox:1.28 \
--rm -it \
-- nslookup kubernetes.default
# Check service endpoints (a Service with no endpoints usually means a
# selector mismatch or unready pods)
kubectl get endpoints SERVICE_NAME
kubectl describe endpoints SERVICE_NAME
# Test service connectivity via the cluster DNS name
kubectl run curl \
--image=curlimages/curl \
--rm -it \
-- curl http://SERVICE_NAME.NAMESPACE.svc.cluster.local
# DNS debugging
# NOTE: the old gcr.io/kubernetes-e2e-test-images registry is retired;
# use the current image from registry.k8s.io instead.
kubectl run dnsutils \
--image=registry.k8s.io/e2e-test-images/jessie-dnsutils:1.7 \
--rm -it \
-- bash
Monitoring and Logging
Resource Monitoring
# Monitor resource usage (requires metrics-server)
kubectl top nodes --sort-by=cpu
kubectl top pods --sort-by=memory
# Get per-container metrics for a specific pod
kubectl top pod POD_NAME --containers
# Append a timestamped snapshot of pod metrics to a log file
# (invoke periodically, e.g. from cron, to build a time series).
# IFS= and read -r preserve whitespace and backslashes in the input.
kubectl top pods --all-namespaces \
--sort-by=cpu \
--no-headers | \
while IFS= read -r line; do
echo "$(date '+%Y-%m-%d %H:%M:%S') $line" >> pod_metrics.log
done
Log Collection
# Collect logs from all containers of pods matching a label selector
kubectl logs -l app=nginx --all-containers=true
# Export logs to a local file
kubectl logs POD_NAME > pod.log
# Stream (follow) logs from multiple pods at once
kubectl logs -f -l app=nginx --all-containers=true
# Prefix each log line with its timestamp
kubectl logs POD_NAME --timestamps=true
# Aggregate logs script: descriptions, per-container logs and events for
# one namespace/label selector, bundled into a timestamped tar.gz.
cat <<'EOF' > collect-logs.sh
#!/bin/bash
# Collect pod describe output, per-container logs, and namespace events
# into a timestamped directory, then archive and remove it.
# Usage: ./collect-logs.sh NAMESPACE LABEL_SELECTOR
set -euo pipefail

NAMESPACE=${1:?usage: collect-logs.sh NAMESPACE LABEL_SELECTOR}
LABEL_SELECTOR=${2:?usage: collect-logs.sh NAMESPACE LABEL_SELECTOR}
OUTPUT_DIR="cluster-logs-$(date +%Y%m%d-%H%M%S)"
mkdir -p "$OUTPUT_DIR"

# Collect pod logs
kubectl get pods -n "$NAMESPACE" -l "$LABEL_SELECTOR" -o name |
while IFS= read -r pod; do
  pod_name=$(basename "$pod")
  mkdir -p "$OUTPUT_DIR/$pod_name"
  # Get pod details
  kubectl describe pod "$pod_name" -n "$NAMESPACE" > \
    "$OUTPUT_DIR/$pod_name/describe.txt"
  # Get logs for every container declared in the pod spec
  kubectl get pod "$pod_name" -n "$NAMESPACE" \
    -o jsonpath='{.spec.containers[*].name}' |
  tr ' ' '\n' |
  while IFS= read -r container; do
    kubectl logs "$pod_name" -c "$container" -n "$NAMESPACE" \
      --timestamps > "$OUTPUT_DIR/$pod_name/$container.log"
  done
done

# Collect events
kubectl get events -n "$NAMESPACE" \
  --sort-by='.lastTimestamp' > "$OUTPUT_DIR/events.txt"

# Create archive and remove the working directory
tar czf "$OUTPUT_DIR.tar.gz" "$OUTPUT_DIR"
rm -rf "$OUTPUT_DIR"
EOF
chmod +x collect-logs.sh
Advanced Scripting and Automation
Deployment Automation
# Rolling update script
cat <<'EOF' > rolling-update.sh
#!/bin/bash
# Update a Deployment's image and watch the rollout;
# roll back automatically if the rollout does not complete.
# Usage: ./rolling-update.sh DEPLOYMENT NEW_IMAGE [NAMESPACE]
set -u

DEPLOYMENT=${1:?usage: rolling-update.sh DEPLOYMENT NEW_IMAGE [NAMESPACE]}
NEW_IMAGE=${2:?usage: rolling-update.sh DEPLOYMENT NEW_IMAGE [NAMESPACE]}
NAMESPACE=${3:-default}

# Update the image of every container in the deployment.
# "*=" is quoted so the shell cannot glob-expand it against files in $PWD.
kubectl set image "deployment/$DEPLOYMENT" \
  "*=$NEW_IMAGE" -n "$NAMESPACE"

# Watch rollout status and verify the deployment
if kubectl rollout status "deployment/$DEPLOYMENT" -n "$NAMESPACE"; then
  echo "Deployment successful"
  # Run tests here
else
  echo "Deployment failed"
  kubectl rollout undo "deployment/$DEPLOYMENT" -n "$NAMESPACE"
  exit 1
fi
EOF
# Health check script
cat <<'EOF' > health-check.sh
#!/bin/bash
# Report pods whose CPU (millicores) or memory (MiB) usage exceeds a
# threshold, plus pods not in Running/Completed state.
# Usage: ./health-check.sh [NAMESPACE] [THRESHOLD]
NAMESPACE=${1:-default}
# NOTE: the threshold is compared against raw millicore and MiB values,
# not percentages — tune accordingly.
THRESHOLD=${2:-80}

check_resources() {
  # Check CPU usage (kubectl top column 3, e.g. "250m")
  kubectl top pods -n "$NAMESPACE" | \
  awk -v threshold="$THRESHOLD" 'NR>1 {
    split($3, cpu, "m")
    if (cpu[1] > threshold) {
      print $1 " CPU: " $3
    }
  }'
  # Check memory usage (kubectl top column 4, e.g. "128Mi")
  kubectl top pods -n "$NAMESPACE" | \
  awk -v threshold="$THRESHOLD" 'NR>1 {
    split($4, mem, "Mi")
    if (mem[1] > threshold) {
      print $1 " Memory: " $4
    }
  }'
}

# Report pods that are in neither Running nor Completed state
kubectl get pods -n "$NAMESPACE" | \
awk 'NR>1 && $3!="Running" && $3!="Completed" {
  print "Pod " $1 " is in state " $3
}'

# Run resource checks
check_resources
EOF
Backup and Restore
# Backup script
cat <<'EOF' > backup-resources.sh
#!/bin/bash
# Dump every listable API resource (namespaced and cluster-scoped) to
# YAML and bundle the result into a timestamped tar.gz archive.
BACKUP_DIR="k8s-backup-$(date +%Y%m%d-%H%M%S)"
mkdir -p "$BACKUP_DIR"

# Backup namespaced resources. Keep going if one resource type fails
# (e.g. a temporarily unavailable aggregated API), but say so on stderr
# instead of silently leaving an empty file.
for resource in $(kubectl api-resources --verbs=list --namespaced -o name); do
  echo "Backing up $resource"
  kubectl get "$resource" --all-namespaces -o yaml \
    > "$BACKUP_DIR/$resource.yaml" \
    || echo "WARNING: failed to back up $resource" >&2
done

# Backup non-namespaced (cluster-scoped) resources
for resource in $(kubectl api-resources --verbs=list --namespaced=false -o name); do
  echo "Backing up $resource"
  kubectl get "$resource" -o yaml \
    > "$BACKUP_DIR/$resource.yaml" \
    || echo "WARNING: failed to back up $resource" >&2
done

# Create archive and remove the working directory
tar czf "$BACKUP_DIR.tar.gz" "$BACKUP_DIR"
rm -rf "$BACKUP_DIR"
EOF
# Restore script
cat <<'EOF' > restore-resources.sh
#!/bin/bash
# Restore resources from an archive produced by backup-resources.sh.
# Usage: ./restore-resources.sh BACKUP_FILE.tar.gz
BACKUP_FILE=${1:-}
if [ -z "$BACKUP_FILE" ] || [ ! -f "$BACKUP_FILE" ]; then
  echo "Backup file not found" >&2
  exit 1
fi

# Extract backup
tar xzf "$BACKUP_FILE"
BACKUP_DIR=$(basename "$BACKUP_FILE" .tar.gz)
if [ ! -d "$BACKUP_DIR" ]; then
  echo "Extracted directory $BACKUP_DIR not found" >&2
  exit 1
fi

# Restore resources. The [ -e ] guard skips the literal "*.yaml"
# pattern that an unmatched glob would otherwise yield.
for yaml in "$BACKUP_DIR"/*.yaml; do
  [ -e "$yaml" ] || continue
  echo "Restoring $(basename "$yaml")"
  kubectl apply -f "$yaml"
done
rm -rf "$BACKUP_DIR"
EOF
Best Practices
Security Best Practices
# Pod security context: non-root user, read-only rootfs, no capabilities.
# (Indentation restored — YAML requires it; the flattened form is invalid.)
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: secure-pod
spec:
  securityContext:
    runAsNonRoot: true
    runAsUser: 1000
    fsGroup: 2000
  containers:
  - name: app
    image: nginx
    securityContext:
      allowPrivilegeEscalation: false
      readOnlyRootFilesystem: true
      capabilities:
        drop:
        - ALL
EOF
# Network policies: empty podSelector matches every pod in the namespace,
# so this denies all ingress and egress by default.
# (Indentation restored — YAML requires it; the flattened form is invalid.)
cat <<EOF | kubectl apply -f -
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: default-deny
spec:
  podSelector: {}
  policyTypes:
  - Ingress
  - Egress
EOF
Resource Management Best Practices
# Resource requests and limits: requests drive scheduling,
# limits cap runtime usage.
# (Indentation restored — YAML requires it; the flattened form is invalid.)
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: resource-pod
spec:
  containers:
  - name: app
    image: nginx
    resources:
      requests:
        memory: "64Mi"
        cpu: "250m"
      limits:
        memory: "128Mi"
        cpu: "500m"
EOF
# Horizontal Pod Autoscaling: scale the "app" Deployment between 2 and 10
# replicas, targeting 80% average CPU utilization (requires metrics-server).
# (Indentation restored — YAML requires it; the flattened form is invalid.)
cat <<EOF | kubectl apply -f -
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: app-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: app
  minReplicas: 2
  maxReplicas: 10
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 80
EOF
Remember:
- Always use resource requests and limits
- Implement proper security contexts
- Use network policies
- Regular backup of critical resources
- Monitor resource usage
- Implement proper logging
- Use namespaces for isolation
- Regular security audits
- Document all configurations
- Use version control for manifests
For detailed information, consult the official Kubernetes documentation and the kubectl command reference (run `kubectl help`).