Thanks for sharing your script, Fox. I expanded the functions to retrieve all current values from smartctl -a output. Overall the script only emails SMART drive temps and error status (I know this is redundant but I prefer to receive an email even when things are ok), but the function supports most of the standard SMART data so one could adapt it to do more, such as storing in a database as Fox suggested. I do realize that it's not as concise and simple as it could be but it works so I'm moving on. I basically just wanted something to monitor drive temps over time and also warn me of any dangerous temperature conditions.
I tested the script quite a bit but if anyone finds a bug please let me know. I plan on building on this code some day so please excuse all my commented out debug probes. In case you were wondering: the odd method of setting writestring = blah followed by file.write(writestring) will hopefully make it easier to update later for Python 3, which supports printing to files (avoiding annoying carriage return and other formatting hassles), which will make it much easier to flip back and forth to printing in the console window and printing to sendmail.
Note: Written and tested in FreeNAS 9.2.1.7 x64.
EDIT: You may notice that I put some crude warning messages in the email alerts to warn when a hard drive has SMART errors. As currently written (as of 11/21/14), the script will not raise an alert if a drive simply dies and stops responding to SMART requests. FreeNAS will throw its own security/file system change warning emails, kind of, but this isn't a complete feature of the script and eventually I'll fix it. When I do, I'll move it to GitHub and post it in this thread.
Code:
import os,string,time,sys
#path of the sendmail binary; the status email is piped into "<sendmail> -t -i"
sendmail = "/usr/sbin/sendmail" # sendmail location
#log file the script appends a line to when closing the sendmail pipe reports a failure
customscriptlog_location = "/var/log/customscriptlog"
def Run_SmartCtl(strArgs):
    """Run ``smartctl`` with the given arguments and return its output lines.

    strArgs -- argument string appended verbatim after "smartctl "
               (for example "--scan" or "-i /dev/ada0").

    Returns a list with one entry per line written to stdout, without line
    endings. The list is empty when the command printed nothing.
    """
    cmdstring = "smartctl " + strArgs
    #NOTE: os.popen passes the string to the shell, so strArgs must only ever
    #be built from trusted values (fixed options and scanned device ids)
    pipe = os.popen(cmdstring)
    try:
        the_output = pipe.read()
    finally:
        #always release the pipe, even if reading fails
        pipe.close()
    return the_output.splitlines()
def Get_Device_Ids():
    """Return the device ids (e.g. /dev/da0) listed by ``smartctl --scan``.

    The device id is the first whitespace-separated token of each output
    line. Blank lines are skipped so that no empty device id is returned.
    """
    Device_List = []
    for line in Run_SmartCtl("--scan"):
        tokens = line.split(None, 1)
        if tokens:
            Device_List.append(tokens[0])
    return Device_List
def Get_Device_Record(device_id):
    """Build a DeviceRecord for one device from ``smartctl -i`` output.

    device_id -- device path handed to smartctl (e.g. /dev/ada0).

    Lines before the one containing "information section" are only searched
    for the "smartctl <version> ..." banner. Lines after it are treated as
    "Label: value" pairs; recognised labels are copied onto the record and
    every other line is ignored.

    Returns the populated DeviceRecord. Fields smartctl did not report keep
    their class defaults.
    """
    #"Label" text (lower-cased) -> DeviceRecord attribute that stores its value
    info_fields = {
        "model family": "family",
        "device model": "model",
        "serial number": "serial",
        "firmware version": "firmware_version",
        "user capacity": "capacity",
        "sector sizes": "sector_sizes",
        "rotation rate": "rotation_rate",
        "device is": "device_is",
        "ata version is": "ata_version",
        "sata version is": "sata_version",
    }
    #first word of a "SMART support is:" value -> (attribute, flag value)
    smart_support_states = {
        "available": ("smart_support_available", True),
        "unavailable": ("smart_support_available", False),
        "enabled": ("smart_support_enabled", True),
        "disabled": ("smart_support_enabled", False),
    }

    dev = DeviceRecord()
    dev.device_id = device_id
    bEnteredInfoSection = False
    for line in Run_SmartCtl("-i " + device_id):
        if not bEnteredInfoSection:
            banner = line.split(" ", 2)
            #second word of the banner line is stored as the smartctl version
            if banner[0].lower() == "smartctl" and len(banner) > 1:
                dev.smartctl_version = banner[1]
            if "information section" in line.lower():
                bEnteredInfoSection = True
            continue

        label, colon, value = line.partition(":")
        if not colon:
            continue  #not a "Label: value" line
        label = label.lower()
        value = value.strip()
        if label in info_fields:
            setattr(dev, info_fields[label], value)
        elif label == "smart support is":
            #this label appears on more than one line; the first word of the
            #value says which of the two flags the line describes
            state = value.split(" ", 1)[0].strip().lower()
            if state in smart_support_states:
                attribute, flag = smart_support_states[state]
                setattr(dev, attribute, flag)
    return dev
def Get_Temp_Data(dev):
    """Fill in the temperature fields of dev and normalise unsupported ones.

    dev -- device record with device_id, current_temp_celsius and
           max_temp_celsius attributes; -273 marks "not retrieved yet".

    If no current temperature is stored yet, ``smartctl -l scttemp`` is
    queried and the first token after "Current Temperature:" is stored (as a
    string). If that yields nothing, both temperature fields become the
    string "None". A max temperature that is 0 or still the -273 sentinel
    also becomes "None", so callers never compare against a bogus threshold.

    Returns the same dev object.
    """
    #only run if drive temp has not already been retrieved
    if dev.current_temp_celsius == -273:
        for line in Run_SmartCtl("-l scttemp " + dev.device_id):
            label, colon, value = line.partition(":")
            if colon and label.lower() == "current temperature":
                #keep the number, drop the temperature units
                reading = value.split()
                if reading:
                    dev.current_temp_celsius = reading[0]
        if dev.current_temp_celsius == -273: #device does not support drive temp
            dev.current_temp_celsius = "None"
            dev.max_temp_celsius = "None"
    #0 = device lists no usable threshold; -273 = no threshold was ever found
    if dev.max_temp_celsius in (0, -273):
        dev.max_temp_celsius = "None"
    return dev
#attribute name (lower-cased, 2nd column of an attribute row) -> dev attribute
#that receives the row's 10th column
_SMART_VALUE_FIELDS = {
    "reallocated_sector_ct": "smartattrib_reallocated_sector_ct",
    "power_on_hours": "smartattrib_power_on_hours",
    "power_cycle_count": "smartattrib_power_cycle_count",
    "wear_leveling_count": "smartattrib_wear_leveling_count",
    "used_rsvd_blk_cnt_tot": "smartattrib_used_rsvd_blk_cnt_tot",
    "program_fail_cnt_total": "smartattrib_program_fail_cnt_total",
    "erase_fail_count_total": "smartattrib_erase_fail_count_total",
    "runtime_bad_block": "smartattrib_runtime_bad_block",
    "reported_uncorrect": "smartattrib_reported_uncorrect",
    "hardware_ecc_recovered": "smartattrib_hardware_ecc_recovered",
    "udma_crc_error_count": "smartattrib_udma_crc_error_count",
    "unknown_attribute": "smartattrib_unknown_attribute",
    "total_lbas_written": "smartattrib_total_lbas_written",
}


def _store_smart_attribute(dev, fields):
    """Copy one attribute-table row into dev; return True if it is the closing row.

    fields -- the whitespace-split row. fields[1] is the attribute name;
              fields[5] and fields[9] are presumably the THRESH and RAW_VALUE
              columns of smartctl's attribute table (TODO confirm against the
              smartctl version in use).
    """
    if len(fields) < 10:
        return False  #header/short line, not a full attribute row
    name = fields[1].lower()
    if name == "airflow_temperature_cel":
        dev.smartattrib_airflow_temperature_cel_threshold = fields[5]
        dev.smartattrib_airflow_temperature_cel_value = fields[9]
        #store device temp if it hasn't already been done
        if dev.current_temp_celsius == -273:
            try:
                dev.max_temp_celsius = int(fields[5])  #int() drops leading zeros
            except ValueError:
                #non-numeric threshold column: mark max temp as unsupported
                dev.max_temp_celsius = "None"
            dev.current_temp_celsius = fields[9]
        return False
    target = _SMART_VALUE_FIELDS.get(name)
    if target is not None:
        setattr(dev, target, fields[9])
    #total_lbas_written is treated as the last row of the attribute section
    return name == "total_lbas_written"


def Sort_Smart_Test_Logs(dev,full_output):
    """Sort the lines of ``smartctl -a`` output into properties of dev.

    dev         -- device record to update (attributes are set on it).
    full_output -- list of output lines, e.g. from Run_SmartCtl("-a " + id).

    Three sections are recognised by their header text:
      * attributes ("smart attributes data structure"): recognised rows are
        copied to dev.smartattrib_*; the section ends at a blank line or at
        the total_lbas_written row.
      * error log ("smart error log version"): dev.smart_error_logged becomes
        "No" if a "no errors logged" line is seen before the next blank
        line, otherwise "YES!".
      * self-test log ("smart self-test log structure"): for rows starting
        with "#", dev.smart_selftests_in_log is set to the row number, and
        dev.smart_selftest_failed becomes "None" if every row contains
        "without error", otherwise "Test <number> failed" for the first
        row that does not (a later passing row never hides it).

    Returns the same dev object.
    """
    bSortingAttributeSection = False
    bSortingErrorLogSection = False
    bSortingSelftestSection = False
    bSelftestFailureSeen = False
    for line in full_output:
        lowered = line.lower()
        stripped = line.strip()

        #sort SMART attributes section
        if not bSortingAttributeSection and "smart attributes data structure" in lowered:
            bSortingAttributeSection = True
        if bSortingAttributeSection:
            if stripped == "":
                #blank line after the table: drives without a
                #total_lbas_written row end the section here
                bSortingAttributeSection = False
            elif _store_smart_attribute(dev, line.split()):
                bSortingAttributeSection = False

        #sort SMART error log section
        if not bSortingErrorLogSection and "smart error log version" in lowered:
            bSortingErrorLogSection = True
        if bSortingErrorLogSection:
            if stripped == "":
                bSortingErrorLogSection = False #last line of error log section
            elif "no errors logged" in lowered:
                dev.smart_error_logged = "No"
            else:
                #the header line itself lands here first; a following
                #"no errors logged" line then overrides it
                dev.smart_error_logged = "YES!"

        #sort SMART self-test log section
        if not bSortingSelftestSection and "smart self-test log structure" in lowered:
            bSortingSelftestSection = True
        if bSortingSelftestSection and stripped.startswith("#"):
            #rows look like "# 1  ..." or "#10  ..."; drop the "#" before splitting
            entry = stripped[1:].split()
            if entry:
                dev.smart_selftests_in_log = entry[0]
                if "without error" in lowered:
                    if not bSelftestFailureSeen:
                        dev.smart_selftest_failed = "None"
                elif not bSelftestFailureSeen:
                    #any row without "without error" counts as a failure
                    bSelftestFailureSeen = True
                    dev.smart_selftest_failed = "Test " + entry[0] + " failed"
    return dev
class DeviceRecord(object):
    """Plain record of everything the script learns about one drive.

    Every field is declared as a class attribute holding its default value;
    the parsing functions overwrite them per instance. -273 in the
    temperature fields means "not retrieved yet".
    """
    #version of SmartCtl used to retrieve device records
    smartctl_version = 0.0
    #SMART device properties
    device_id = ""
    family = ""
    model = ""
    serial = ""
    firmware_version = ""
    capacity = ""
    sector_sizes = ""
    rotation_rate = ""
    device_is = ""
    ata_version = ""
    sata_version = ""
    smart_support_available = False
    smart_support_enabled = False
    current_temp_celsius = -273
    max_temp_celsius = -273
    #SMART attribute values
    smartattrib_reallocated_sector_ct = ""
    smartattrib_power_on_hours = ""
    smartattrib_power_cycle_count = ""
    smartattrib_wear_leveling_count = ""
    smartattrib_used_rsvd_blk_cnt_tot = ""
    smartattrib_program_fail_cnt_total = ""
    smartattrib_erase_fail_count_total = ""
    smartattrib_runtime_bad_block = ""
    smartattrib_reported_uncorrect = ""
    smartattrib_airflow_temperature_cel_value = ""
    smartattrib_airflow_temperature_cel_threshold = ""
    smartattrib_hardware_ecc_recovered = ""
    smartattrib_udma_crc_error_count = ""
    smartattrib_unknown_attribute = ""
    smartattrib_total_lbas_written = ""
    #SMART error logs
    smart_error_logged = ""
    #SMART self-test log
    smart_selftests_in_log = ""
    smart_selftest_failed = ""

    def __repr__(self):
        #short identifying summary, handy when printing lists of records
        return "DeviceRecord(device_id=%r, model=%r, serial=%r)" % (
            self.device_id, self.model, self.serial)
def RemoveDupes(device_records):
    """Return device_records without repeated serial numbers.

    device_records -- iterable of records exposing a ``serial`` attribute.

    The first record seen for each serial is kept and the input order is
    preserved. Records sharing an empty serial count as duplicates of each
    other too.
    """
    seen_serials = set()
    dedupedlist = []
    for device in device_records:
        if device.serial not in seen_serials:
            seen_serials.add(device.serial)
            dedupedlist.append(device)
    return dedupedlist
def RemoveDisabledUnsupported(devices):
    """Return only the devices whose SMART support is both available and enabled.

    devices -- iterable of records exposing the boolean attributes
               ``smart_support_available`` and ``smart_support_enabled``.

    Input order is preserved.
    """
    return [
        device for device in devices
        if device.smart_support_available and device.smart_support_enabled
    ]
#########################################################################
#main script: gather SMART data for every drive, then email a status report

#a drive with no usable max-temp threshold is reported as hot above this (degrees C)
DEFAULT_TOO_HOT_CELSIUS = 43
#a drive with a threshold is reported as hot once it is within this many degrees of it
MAX_TEMP_MARGIN_CELSIUS = 2
email_rule = "==============================================================================================\n"
email_row_format = "{0:12}{1:13}{2:16}{3:10}{4:12}{5:14}{6:18}"

#enumerate devices, get their device_ids (e.g. /dev/da0)
alldeviceids = Get_Device_Ids()
if not alldeviceids:
    print("No devices found.")

#obtain device properties
list_of_device_records = [Get_Device_Record(device_id) for device_id in alldeviceids]

#remove any duplicate device record lines
dedupedlist = RemoveDupes(list_of_device_records)

#remove devices that indicated that SMART support is unavailable or disabled
compatibledevices = RemoveDisabledUnsupported(dedupedlist)
if not compatibledevices:
    print("No compatible devices.")

#sort smart log data
for dev in compatibledevices:
    Sort_Smart_Test_Logs(dev, Run_SmartCtl("-a " + dev.device_id))

#check device_ids for SMART errors or overtemp condition, also truncate dev.model entries to 11 characters
device_ids_with_smart_errors = []
device_ids_too_hot = []
device_ids_no_temp_support = []
for dev in compatibledevices:
    #truncate dev.model to 11 characters so it fits its report column
    dev.model = dev.model[:11]
    Get_Temp_Data(dev) #set unsupported temp fields to None

    #a failed self-test may have been stored as a tuple of string pieces;
    #flatten it so .lower() and the fixed-width report formatting work
    if isinstance(dev.smart_selftest_failed, tuple):
        dev.smart_selftest_failed = "".join(map(str, dev.smart_selftest_failed))

    #NOTE: an empty self-test status (no self-test rows parsed) is also flagged
    if ("yes" in dev.smart_error_logged.lower()
            or "none" not in dev.smart_selftest_failed.lower()):
        device_ids_with_smart_errors.append(dev.device_id)

    current_unsupported = "none" in str(dev.current_temp_celsius).lower()
    max_unsupported = "none" in str(dev.max_temp_celsius).lower()
    if current_unsupported: #temp is not at all supported
        device_ids_no_temp_support.append(dev.device_id)
    elif max_unsupported: #only max temp is unsupported: use the fixed limit
        if int(dev.current_temp_celsius) > DEFAULT_TOO_HOT_CELSIUS:
            device_ids_too_hot.append(dev.device_id)
    else: #temp and max temp are supported
        hot_limit = int(dev.max_temp_celsius) - MAX_TEMP_MARGIN_CELSIUS
        if int(dev.current_temp_celsius) > hot_limit:
            device_ids_too_hot.append(dev.device_id)
            print("%s too hot. temp =  %d , max-2 =  %d"
                  % (dev.device_id, int(dev.current_temp_celsius), hot_limit))

#the report header shows the smartctl version seen on the first compatible
#device; there may be none, so do not index blindly
if compatibledevices:
    reported_smartctl_version = str(compatibledevices[0].smartctl_version)
else:
    reported_smartctl_version = "unknown"

#open pipe to sendmail
smartstatus_email = os.popen("%s -t -i" % sendmail, "w")

#generate email message
smartstatus_email.write("To: [EMAIL ADDRESS]\n")
if device_ids_with_smart_errors:
    smartstatus_email.write("Subject: SMART ERRORS DETECTED - local.freenas\n")
elif device_ids_too_hot:
    smartstatus_email.write("Subject: DRIVES TOO HOT - local.freenas\n")
else:
    smartstatus_email.write("Subject: SMART OK - local.freenas - drive temps & SMART logs\n")
smartstatus_email.write("\n") #blank line ends the headers

#print general system info
smartstatus_email.write(email_rule)
smartstatus_email.write("Output for smartstatus.py " + time.strftime("%m/%d/%y %H:%M:%S") + "\n")
smartstatus_email.write("SmartCtl version = " + reported_smartctl_version + " (script written for SmartCtl 6.2)\n")
smartstatus_email.write("Python version = " + "%d.%d.%d" % tuple(sys.version_info[:3]) + " (script written for Python 2.7)\n")
smartstatus_email.write("Total devices detected = " + str(len(alldeviceids)) + "\n")
smartstatus_email.write("SMART compatible devices = " + str(len(compatibledevices)) + "\n")
smartstatus_email.write(email_rule)
smartstatus_email.write(email_row_format.format("Device ID", "Model", "Serial", "Temp/Max", "SMART Logs", "SMART Errors", "Self-Test Errors"))
smartstatus_email.write("\n")

#print stats for each device
for x in compatibledevices:
    smartstatus_email.write(email_row_format.format(
        x.device_id, x.model, x.serial,
        str(x.current_temp_celsius) + "/" + str(x.max_temp_celsius),
        x.smart_selftests_in_log, x.smart_error_logged, x.smart_selftest_failed))
    smartstatus_email.write("\n")
smartstatus_email.write(email_rule)

#print some info about problems, if any
smartstatus_email.write("Automated warnings: (even if nothing is here then there still may be bad things happening)\n")
if len(compatibledevices) < len(alldeviceids):
    #devices that were scanned but dropped as duplicate, unsupported or disabled
    int_unsupported_devices = len(alldeviceids) - len(compatibledevices)
    smartstatus_email.write("\n")
    smartstatus_email.write("Warning: " + str(int_unsupported_devices) + " unsupported devices were detected. This may be OK, or it may indicate that the script isn't executing properly, or doesn't match the version of smartctl that it was developed on. Examine devices with smartctl in a shell prompt to investigate.\n")
if device_ids_with_smart_errors:
    smartstatus_email.write("\n")
    for bad_device_id in device_ids_with_smart_errors:
        smartstatus_email.write("WARNING! Device " + bad_device_id + " has error(s)!\n")
    smartstatus_email.write("Type 'smartctl -a /device/ID' in server shell for more info!\n")
if device_ids_too_hot:
    for hot_device_id in device_ids_too_hot:
        smartstatus_email.write("WARNING! Device " + hot_device_id + " is running hot!\n")
    smartstatus_email.write("Type 'smartctl -a /device/ID' in server shell for more info!\n")
if device_ids_no_temp_support:
    smartstatus_email.write("\n")
    smartstatus_email.write("*Notice: The following SMART device(s) do not support SMART drive temperatures:\n")
    smartstatus_email.write('[%s]' % ', '.join(map(str, device_ids_no_temp_support)))

#sendmail sends upon closing
smstatus = smartstatus_email.close()
#close() on an os.popen pipe returns None when the command exited with
#status 0, so only a non-None, non-zero value is an error
if smstatus is not None and smstatus != 0:
    #if sendmail returns error, complain into customscriptlog
    with open(customscriptlog_location, "a") as customscriptlog: # a = append file
        customscriptlog.write("smartstatus.py: " + time.strftime("%m/%d/%y %H:%M:%S") + " Sendmail exit status = " + str(smstatus) + "\n")
    print("Sendmail exit status has error code = " + str(smstatus))