| #!/bin/bash |
| |
| # Copyright (C) 2018 Oracle. All Rights Reserved. |
| # |
| # Author: Darrick J. Wong <darrick.wong@oracle.com> |
| # |
| # This program is free software; you can redistribute it and/or |
| # modify it under the terms of the GNU General Public License |
| # as published by the Free Software Foundation; either version 2 |
| # of the License, or (at your option) any later version. |
| # |
| # This program is distributed in the hope that it would be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| # GNU General Public License for more details. |
| # |
| # You should have received a copy of the GNU General Public License |
| # along with this program; if not, write the Free Software Foundation, |
| # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. |
| |
# Automatically check an LVM-managed filesystem online.
# We use LVM snapshots to do this, which means that we can only
# check filesystems in VGs that have at least 256MB (or so) of
# free space.
| |
# Built-in defaults.  They are assigned before the optional configuration
# file is sourced, so that file may override any of them.
conffile="@root_sysconfdir@/e2scrub.conf"
snap_size_mb=256	# snapshot size handed to lvcreate, in megabytes
e2fsck_opts=""		# extra options appended to every e2fsck invocation
fstrim=0		# set to 1 by -t
reap=0			# set to 1 by -r

if [ -f "${conffile}" ]; then
	. "${conffile}"
fi
| |
# Print the usage summary to stdout.
# Outputs: six lines of usage text; $0 is used as the program name.
print_help() {
	printf '%s\n' \
		"Usage: $0 [OPTIONS] mountpoint | device" \
		"" \
		"mountpoint must be on a LVM-managed block device" \
		"-r: Remove e2scrub snapshot and exit, do not check anything." \
		"-t: Run fstrim if successful." \
		"-V: Print version information and exit."
}
| |
# Print the program name, version and release date on one line of stdout.
# The @...@ tokens are placeholders, presumably substituted at build time.
print_version() {
	printf '%s\n' "e2scrub @E2FSPROGS_VERSION@ (@E2FSPROGS_DATE@)"
}
| |
# Terminate the script with the given status.
#
# Globals:   SERVICE_MODE (read) - non-empty when run as a service
# Arguments: $1 - requested exit status
# Returns:   never returns; exits the shell
exitcode() {
	local status="$1"

	if [ -n "${SERVICE_MODE}" ]; then
		# Service exit codes have to follow the LSB init script
		# action error guidelines, so every failure is folded into
		# 1 ("generic or unspecified error", LSB 5.0 section 22.2);
		# the admin is expected to read the log for the details.
		if [ "${status}" -ne 0 ]; then
			status=1
		fi

		# journald ties log messages to the systemd service through
		# our pid, and the fail service collects messages by service
		# name for its error report.  Linger for two seconds so that
		# all of our messages are captured before the pid goes away.
		sleep 2
	fi

	exit "${status}"
}
| |
# Command line parsing.  -r and -t only record a flag; -V prints the
# version and exits 0; anything getopts does not recognise prints the
# usage text and exits 2.
while getopts "rtV" opt; do
	case "${opt}" in
	r)
		reap=1
		;;
	t)
		fstrim=1
		;;
	V)
		print_version
		exitcode 0
		;;
	*)
		print_help
		exitcode 2
		;;
	esac
done
shift "$((OPTIND - 1))"

# The first remaining word is the mountpoint or device to work on; a
# missing (or empty) one is a usage error.
arg="$1"
[ -n "${arg}" ] || {
	print_help
	exitcode 1
}
| |
# Find the device for a given mountpoint.
#
# Arguments: $1 - path to look up; it is canonicalised with realpath and
#                 compared against the MOUNTPOINT column reported by lsblk
# Outputs:   the NAME column (printed as a full path because of lsblk -p)
#            of the first ext[234] filesystem mounted exactly there
# Returns:   0 if a device was printed, 1 otherwise
dev_from_mount() {
	local mountpt
	# Keep the variables assigned by the eval below out of global scope.
	local vars NAME FSTYPE MOUNTPOINT

	# A path that cannot be resolved must not degrade to the empty
	# string: lsblk reports MOUNTPOINT="" for every unmounted device, so
	# an empty needle would match the first unmounted ext[234] fs.
	mountpt="$(realpath "$1")" || return 1
	[ -n "${mountpt}" ] || return 1

	# The loop is fed through process substitution instead of a pipe so
	# that it runs in the current shell; inside a pipeline the "return 0"
	# would only leave the pipeline's subshell and the function would
	# always fall through to "return 1".
	while read -r vars; do
		NAME="" FSTYPE="" MOUNTPOINT=""
		# NOTE(review): this evaluates lsblk's KEY="value" pairs (-P)
		# as shell code.  It relies on lsblk quoting its output safely;
		# do not feed this loop from any other source.
		eval "${vars}"
		[ "${mountpt}" = "${MOUNTPOINT}" ] || continue
		case "${FSTYPE}" in
		ext[234])
			echo "${NAME}"
			return 0
			;;
		esac
	done < <(lsblk -o NAME,FSTYPE,MOUNTPOINT -p -P -n 2> /dev/null)
	return 1
}
| |
# Validate a block device given on the command line.
#
# Arguments: $1 - device path
# Outputs:   echoes the device path back when lsblk reports its FSTYPE
#            as ext2, ext3 or ext4
# Returns:   0 if the path was echoed, 1 otherwise
dev_from_arg() {
	local blockdev="$1"
	local fs

	fs="$(lsblk -o FSTYPE -n "${blockdev}" 2> /dev/null)"
	if [[ "${fs}" == ext[234] ]]; then
		echo "${blockdev}"
		return 0
	fi
	return 1
}
| |
# Print the mountpoint lsblk reports for a block device.
#
# Arguments: $1 - device path; when empty nothing is printed
# Outputs:   the MOUNTPOINT column of lsblk for that device
# Returns:   0 for an empty argument, otherwise the status of lsblk
mnt_from_dev() {
	local blockdev="$1"

	[ -n "${blockdev}" ] || return 0
	lsblk -o MOUNTPOINT -n "${blockdev}"
}
| |
# Construct block device path and mountpoint from argument.  A block
# device argument is validated directly and its mountpoint looked up;
# anything else is treated as a mountpoint and mapped back to its device.
if [ -b "${arg}" ]; then
	dev="$(dev_from_arg "${arg}")"
	mnt="$(mnt_from_dev "${dev}")"
else
	dev="$(dev_from_mount "${arg}")"
	mnt="${arg}"
fi
if [ ! -e "${dev}" ]; then
	echo "${arg}: Not an ext[234] filesystem."
	print_help
	exitcode 16
fi

# Make sure this is an LVM device we can snapshot.
# Start from a clean slate: if lvs fails it prints nothing, and values
# inherited from the environment must not make the check below pass.
unset LVM2_LV_NAME LVM2_VG_NAME LVM2_LV_ROLE
lvm_vars="$(lvs --nameprefixes -o name,vgname,lv_role --noheadings "${dev}" 2> /dev/null)"
# NOTE(review): the lvs output is evaluated as shell code; with
# --nameprefixes it is expected to consist of LVM2_*='value' assignments.
eval "${lvm_vars}"
# Refuse devices without an LV/VG name and devices whose role contains
# "snapshot".
if [ -z "${LVM2_VG_NAME}" ] || [ -z "${LVM2_LV_NAME}" ] ||
   echo "${LVM2_LV_ROLE}" | grep -q "snapshot"; then
	echo "${arg}: Not connected to a LVM logical volume."
	print_help
	exitcode 16
fi
# start_time is what mark_clean records as the time of the check.
start_time="$(date +'%Y%m%d%H%M%S')"
snap="${LVM2_LV_NAME}.e2scrub"
snap_dev="/dev/${LVM2_VG_NAME}/${snap}"
| |
# Remove the scrub snapshot, retrying while the removal keeps failing.
#
# Globals:   DBG, LVM2_VG_NAME, snap, snap_dev (all read)
# Returns:   always 0; no caller inspects the result
teardown() {
	local rc deadline

	# Give up after 30s, the same limit setup() applies, so that a
	# snapshot which can never be removed does not hang the exit path.
	deadline="$(( $(date "+%s") + 30 ))"

	# Remove and wait for removal to succeed.  The lvremove status has
	# to be saved right away: reading $? after the "-e" test would yield
	# the status of that test, never the status of lvremove.
	${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}" 3>&-
	rc="$?"
	while [ "${rc}" -eq 5 ] && [ -e "${snap_dev}" ] &&
	      [ "$(date "+%s")" -lt "${deadline}" ]; do
		sleep 0.5
		${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}" 3>&-
		rc="$?"
	done
	return 0
}
| |
# Run e2fsck against the snapshot in two passes.
#
# Globals:   DBG, e2fsck_opts, snap_dev (read);
#            E2FSCK_FIXES_ONLY (set to 1 and exported)
# Returns:   the status of the journal-recovery pass if it is non-zero,
#            otherwise the status of the full pass.  Any non-zero status
#            (errors fixed or remaining) means the filesystem is bad.
check() {
	local e2fsck="@root_sbindir@/e2fsck"
	local rc

	export E2FSCK_FIXES_ONLY=1

	# Pass 1: recover the journal only.
	# e2fsck_opts is left unquoted on purpose so it splits into words.
	${DBG} "${e2fsck}" -E journal_only -p ${e2fsck_opts} "${snap_dev}"
	rc="$?"
	if [ "${rc}" -ne 0 ]; then
		return "${rc}"
	fi

	# Pass 2: forced check, to see whether e2fsck attempts any
	# non-optimization repairs.
	${DBG} "${e2fsck}" -f -y ${e2fsck_opts} "${snap_dev}"
}
| |
# Record a successful scrub on the real device: tune2fs is run with
# "-C 0" and "-T <start_time>" (mount count and last-checked time
# according to tune2fs(8)).
#
# Globals: DBG, start_time, dev (all read)
# Returns: the status of tune2fs
mark_clean() {
	local tune2fs="@root_sbindir@/tune2fs"

	${DBG} "${tune2fs}" -C 0 -T "${start_time}" "${dev}"
}
| |
# Flag the real device as needing a check by running tune2fs with
# "-E force_fsck".
#
# Globals: DBG, dev (both read)
# Returns: the status of tune2fs
mark_corrupt() {
	local tune2fs="@root_sbindir@/tune2fs"

	${DBG} "${tune2fs}" -E force_fsck "${dev}"
}
| |
# Create the scrub snapshot, first clearing away a stale one.
#
# Globals:   DBG, LVM2_VG_NAME, LVM2_LV_NAME, snap, snap_dev,
#            snap_size_mb, arg (all read)
# Outputs:   a one-line diagnostic on stdout when it fails
# Returns:   0 once the snapshot has been created, 1 if a previous
#            snapshot could not be removed or lvcreate failed
setup() {
	local rc lvremove_deadline

	# Try to remove snapshot for 30s, bail out if we can't remove it.
	# The lvremove status is saved immediately; reading $? after the
	# "-e" test would give the status of that test instead.
	lvremove_deadline="$(( $(date "+%s") + 30 ))"
	# Errors of the first attempt are hidden; presumably because there
	# normally is no stale snapshot to remove.
	${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}" 3>&- 2> /dev/null
	rc="$?"
	while [ "${rc}" -eq 5 ] && [ -e "${snap_dev}" ] &&
	      [ "$(date "+%s")" -lt "${lvremove_deadline}" ]; do
		sleep 0.5
		${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}" 3>&-
		rc="$?"
	done
	if [ -e "${snap_dev}" ]; then
		echo "${arg}: e2scrub snapshot is in use, cannot check!"
		return 1
	fi

	# Create the snapshot, wait for device to appear.
	if ! ${DBG} lvcreate -s -L "${snap_size_mb}m" -n "${snap}" "${LVM2_VG_NAME}/${LVM2_LV_NAME}" 3>&-; then
		echo "${arg}: e2scrub snapshot FAILED, will not check!"
		return 1
	fi
	${DBG} udevadm settle 2> /dev/null
	return 0
}
| |
# Reap mode (-r): remove a leftover scrub snapshot, if there is one, and
# stop without checking anything.
if [ "${reap}" -gt 0 ]; then
	[ -e "${snap_dev}" ] && teardown 2> /dev/null
	exit 0
fi

# Take the snapshot.  From here on it has to be removed again if the
# script exits or is signalled before one of the explicit teardowns below.
setup || exitcode 8
trap "teardown; exit 1" EXIT INT QUIT TERM

# Check and react
check
check_status="$?"
if [ "${check_status}" = "0" ]; then
	# Clean check!
	echo "${arg}: Scrub succeeded."
	mark_clean
	teardown
	# Only the EXIT trap is disarmed; the signal traps stay installed.
	trap '' EXIT

	# Trim the free space, which requires the snapshot be deleted.
	if [ "${fstrim}" -eq 1 ] && [ -d "${mnt}" ] && type fstrim > /dev/null 2>&1; then
		echo "${arg}: Trimming free space."
		fstrim -v "${mnt}"
	fi

	ret=0
elif [ "${check_status}" = "8" ]; then
	# Operational error, what now?
	echo "${arg}: e2fsck operational error."
	teardown
	trap '' EXIT
	ret=8
else
	# fsck failed.  Before blaming the filesystem, ask lvs whether the
	# snapshot itself went invalid (this must happen before teardown).
	# An invalid snapshot is not necessarily a failure of the fs: the
	# mounted filesystem may have overflowed the snapshot with regular
	# disk writes, or our own repairs may have written too much.
	#
	# If it's really corrupt we ought to fsck at next boot.
	is_invalid="$(lvs -o lv_snapshot_invalid --noheadings "${snap_dev}" | awk '{print $1}')"
	if [ -z "${is_invalid}" ]; then
		echo "${arg}: Scrub FAILED due to corruption! Unmount and run e2fsck -y."
		mark_corrupt
		ret=6
	else
		echo "${arg}: Scrub FAILED due to invalid snapshot."
		ret=8
	fi
	teardown
	trap '' EXIT
fi

exitcode "${ret}"