# $Header: emdb/sysman/admin/scripts/alertlog.pl /st_emdbsa_11.2/2 2008/12/22 10:30:22 ychan Exp $ # # alertlog.pl # # Copyright (c) 2002, 2008, Oracle and/or its affiliates.All rights reserved. # # NAME # alertlog.pl - scans an alert log file for patterns # # DESCRIPTION # Given a filename and a pattern, alertlog greps thru an oracle log # looking for one pattern while (optionally) ignoring another. Matching # lines are returned in a collection along with the most recent time # stamp logged. # # NOTES # The output format (column list) of this script is dynamic: the number of # columns == n+5, n==EM_METRIC_ERROR_PATTERNS. # 1: Timestamp/Line Number # 2: Errors matching no patterns (generic) # 3..(n+2): Errors matching patterns # EM_METRIC_ERROR_PATTERN1..EM_METRIC_ERROR_PATTERNn # n+3: Trace file # n+4: Error code list # n+5: Alert log filename # # MODIFIED (MM/DD/YY) # ychan 12/10/08 - XbranchMerge ychan_bug-6343005 from # st_emdbgc_10.2.0.1.0 # ychan 11/26/08 - Fix bug 6343005 # ajdsouza 10/28/06 - Big fix 7422639 # mkiran 03/10/07 - 5711362: Report deadlock (ORA-0*60) at GC. # hying 12/10/05 - Backport hying_bug-4734310 from main # hying 12/01/05 - 4734310, NT CRSAlert fix # ychan 09/16/05 - Fix bug 4613946 # hying 08/11/05 - 4547860 # hying 04/08/05 - no generic error for CRS # hying 01/03/05 - errorPrefix for alertlog # jsoule 11/29/04 - accept filename/timestamp pattern # jsoule 03/09/04 - provide for flexible columns # jsoule 01/22/04 - tune alertlog monitoring for large error stacks # jsoule 10/27/03 - report error if alert log dir not found # jsoule 10/03/03 - use agentStateDir to locate state # jsoule 09/04/03 - correct offset lookup # jsoule 08/29/03 - allow either alert log file format # jsoule 07/08/03 - improve scalability # jsoule 05/27/03 - discard incomplete error stacks # jsoule 04/14/03 - add 16038 to the archiver hung category # jsoule 03/28/03 - roll up all similar error stacks # jsoule 11/27/02 - check ignores after switching to specific error column # jsoule 11/26/02 - capture archiver errors appropriately # jsoule 11/06/02 - move general alertlog logic to alertlog_util # jsoule 10/25/02 - move metric state files from upload to state # jsoule 10/17/02 - remove unnecessary if & count lines on bad state # jsoule 10/15/02 - return a single-part key # jsoule 09/30/02 - do not return the results of the first scan # jsoule 09/04/02 - only rollup 600 & 7445 errors # jsoule 08/30/02 - change output behavior for "no data # jsoule 08/28/02 - fix trace file match # jsoule 08/22/02 - modify alert log metric implementation # ychan 07/11/02 - Filter db ora errors # xxu 06/24/02 - remove /usr/local/bin/perl # jsoule 04/05/02 - support timestamps between the 1st and 9th of the month. # jsoule 03/14/02 - return null for no problems; transient error code list. # prabuck 02/25/02 - changed target directory for writing alert log state files # prabuck 02/21/02 - removed parameters known to environment from command line # qsong 01/25/02 - Merged qsong_alertlog_again # prabuck 01/09/02 - Creation # # use/require appropriate libraries use strict; use Time::HiRes; use HTTP::Date; require "emd_common.pl"; require "alertlog_util.pl"; # Command Line Arguments my $log_file_absolute = shift(@ARGV); # log file (absolute) my $log_timestamp_format = shift(@ARGV); # log (entry) timestamp format my $errorPrefix = shift(@ARGV); # error prefix for alert log, ORA or CRS # If CRS, errorPrefix is in the format of CRS%CRSVersion% if ($errorPrefix =~ /CRS(.+)/) { if ($1 =~ /10.1/) { EMAGENT_PERL_INFO("CRS Alert is not supported in version $1"); exit 0; } ## Strip CRSVersion off $errorPrefix = 'CRS'; } my $backgroundDumpDest = shift(@ARGV); # background_dump_dest my $userDumpDest = shift(@ARGV); # user_dump_dest my $limitSwitch = shift(@ARGV); # The switch that limits the amount of # output this scan can produce. # Convention: > 0 => time (sec.) to spend # eq "EOF" => skip to the end # otherwise ignore my $ignorePattern = shift(@ARGV); # The pattern (a PERL regular expression) # for determining which log entries to # ignore. # End Arguments # get target_name, $ORACLE_HOME, and $ORACLE_SID from environment my $targetGuid = $ENV{EM_TARGET_GUID}; my $oracle_home = $ENV{EM_TARGET_ORACLE_HOME}; my $SID = $ENV{EM_TARGET_ORACLE_SID}; if (EMAGENT_isPerlDebugEnabled()) { EMAGENT_PERL_DEBUG("target: $targetGuid; (pattern to suppress: $ignorePattern)"); } # establish dump destinations $backgroundDumpDest = get_absolute_dump_dest($backgroundDumpDest, $oracle_home); $userDumpDest = get_absolute_dump_dest($userDumpDest, $oracle_home); if ($log_file_absolute eq "[MISSING_LOG]") { # silently quit if the alert log file has been removed EMAGENT_PERL_INFO("No log file provided."); exit 0; } else { if (EMAGENT_isPerlDebugEnabled()) { EMAGENT_PERL_DEBUG("log file (absolute): $log_file_absolute."); } } # 4734310 if ($^O =~ m/MSWin32/) { $log_file_absolute =~ s/\//\\/g; } my @path_comps = split(($^O =~ m/MSWin32/ ? "\\\\" : "\/"), $log_file_absolute); my $log_file_relative = $path_comps[$#path_comps]; my $state_root = $ENV{EM_AGENT_STATE_DIR}; # establish the columns that are not "generic" my @columnPatterns = (); &initializeColumnPatterns; # extract array of numbers (31, 023, etc.) from log_timestamp_format if (EMAGENT_isPerlDebugEnabled()) { EMAGENT_PERL_DEBUG("log timestamp format spec: $log_timestamp_format."); } my @max = split(/\D/, $log_timestamp_format); my $max = \@max; # translate format into regular expression $log_timestamp_format =~ s/\d+/(\\d+)/g; if (EMAGENT_isPerlDebugEnabled()) { EMAGENT_PERL_DEBUG("log timestamp format regexp: $log_timestamp_format."); } # # Check the accessibility of the alert log. # if (!open(LOG,"< $log_file_absolute")) { # This can happen in two cases: # 1) Alert log exists (i.e., was identifiable) but is not readable. This # certainly warrants a warning message. # 2) Alert log does not exist, and has been removed since the log file was # identified. This is not technically a warning situation -- ideally, # the alert log identification should be cleared, but at this point # that's not feasible. if (EMAGENT_isPerlWarningEnabled()) { EMAGENT_PERL_WARN("target: $targetGuid; unable to open alert log at $log_file_absolute"); } exit 0; } # # Build the fully qualify scanned file path. # my $separator = $^O =~ m/MSWin32/ ? "\\" : "\/"; # # Location of the offsets state file is # $EMDROOT/sysman/emd/state/. # my $scannedFile = $state_root.$separator."sysman".$separator."emd".$separator."state".$separator."$targetGuid.$log_file_relative"; # capture start time, in seconds and track the number of stacks processed so # the time check is performed relatively infrequently. my $start_time = Time::HiRes::time; my $stacks_processed = 0; # # Limit switch must be: # - a positive integer (representing seconds) # - the string "EOF" (indicating that the tail should be skipped) # - the string "tail" (indicating the normal behavior, process the entire tail) # When it's something other than these, the "tail" behavior is followed. # if ($limitSwitch =~ m/^\d+$/) { if (EMAGENT_isPerlDebugEnabled()) { EMAGENT_PERL_DEBUG("Stopping soon after $limitSwitch seconds."); } } elsif ($limitSwitch eq "EOF") { if (EMAGENT_isPerlDebugEnabled()) { EMAGENT_PERL_DEBUG("Skipping remaining entries in $log_file_absolute"); } } else { if ($limitSwitch ne "tail") { if (EMAGENT_isPerlWarningEnabled()) { EMAGENT_PERL_WARN("Invalid limit switch specification ($limitSwitch) -- using 'tail' option"); } } $limitSwitch = 0; } my $offset = 0; my $line = 0; # # Open scanned file for reading (if scanned file does not exist, create it.) # if (!open(SCANNED, "+>> $scannedFile")) { EMAGENT_PERL_ERROR("target: $targetGuid; Cannot open $scannedFile for read/write."); exit 1; } # seek to top of file seek(SCANNED, 0, 0); # file only contains a single line with two numbers: an offset and line number # get these and close the scanned file while () { my @pos = split(' ', $_); my $pos = \@pos; $offset = $pos->[0]; $line = $pos->[1]; } close(SCANNED); my @errStackKeys; # "keys" for already seen error stacks my $initialTimestamp = ""; # timestamp for the current error stack if ($offset == 0 || $limitSwitch eq "EOF") { if (EMAGENT_isPerlDebugEnabled()) { EMAGENT_PERL_DEBUG("target $targetGuid; counting alert log lines only"); } if ($offset == 0) { # # Trust neither offset nor line, and start counting from the beginning. # $line = 0; } else { # # Trust both offset and line, and start counting from the offset. # seek(LOG, $offset, 0); } while () { $line++; } } else { if (EMAGENT_isPerlDebugEnabled()) { EMAGENT_PERL_DEBUG("target $targetGuid; processing alert log tail starting at byte offset $offset, line $line"); } # # Scan, and process, the alert log as long as the byte offset is valid # for the existing alert log. # my $lineNumber = 0; # line number of timestamp my $errStack = ""; # content of the current error stack my $errCodes = ""; # key for the current error stack my $traceFileName = ""; # name of trace file # # Go directly to EOF (Do not pass GO. Do not collect $200.) # Do not read past the end of the file (at the time we first looked). # seek(LOG, 0, 2); my $eof_offset = tell(LOG); if (EMAGENT_isPerlDebugEnabled()) { EMAGENT_PERL_DEBUG("EOF position is $eof_offset."); } # # Also, record the file's offset before the current line read, in case we # are time- or size-limited and need to reset the LOG file descriptor. # my $prev_offset; if ($offset <= $eof_offset) { # # A valid offset was provided, seek to it and process the lines. # seek(LOG, $offset, 0); $prev_offset = $offset; my $addContent; my $stack_offset = $offset; while () { $line++; # # Clean up the line (remove \n, leading, and trailing whitespace). # chop; $_ =~ s/^\s+//; $_ =~ s/\s+$//; # # Discontinue adding error stack content if we've hit the limit # on string size supported by EM. # $addContent = ($prev_offset - $stack_offset < 4000); # # Look for a new timestamp entry. # if ($_ =~ m/$log_timestamp_format/ && ×tampValid($&)) { # # Process error stack timestamp. # # # Check for a time or size limiting condition. # (Note: $prev_offset is a function of the current size of the # file, $eof_offset is the original size of the file; # we don't want to "chase" the end of the file when # it's growing at a rate faster than our scan rate!) # if (&outOfTime || $prev_offset >= $eof_offset) { # # "Unscan" the latest timestamp by backing up to the # previous offset and a single line. # $line--; seek(LOG, $prev_offset, 0); if (EMAGENT_isPerlDebugEnabled()) { EMAGENT_PERL_DEBUG("Stopped at " . Time::HiRes::time); } # # Break from the input loop BEFORE writing the error stack # (the last error stack is written after all lines to be # read are). # last; } # # Dump the existing errStack, if there is one. # if ($initialTimestamp && $errCodes) { &writeErrStack($initialTimestamp, $lineNumber, $errStack, $errCodes, $traceFileName); } # # Begin a new error stack. All tests passed, save valid # timestamp # $initialTimestamp = $&; $lineNumber = $line; $errCodes = ""; # reset information regarding the stack $errStack = ""; $traceFileName = &traceFileOf($_); $stack_offset = tell(LOG); $stacks_processed++; $prev_offset = $stack_offset; } else { # # Process error stack content. # # # Augment archiver errors so that they are identified # (appropriately) as ORA- errors. # s/(ARC[0-9]*: Error )([0-9]+[^0-9].*)/\1ORA\-\2/; if (!$traceFileName && $lineNumber + 1 == $line) { # # (Trace file identification is not reported in the stack) # $traceFileName = &traceFileOf($_); $addContent = ($traceFileName eq ""); # 5711362: Error stack for deadlock includes ORA-0*60 and # the trace file name on the same line. Inorder to report # deadlock at GC, we consider ORA-0*60 as an exception to # trace file skip from errStack. if ($traceFileName ne "") { if ($_ =~ m/$errorPrefix-(\d*)/) { my $error = $1; if ( ($errorPrefix eq "ORA") && ($error == 60) ) { $addContent = 1; } # bug fix 7422639 elsif ( $errorPrefix eq "CRS" ) { $addContent = 1; } } } } if ($addContent) { # # (Any other content is, as long as we haven't hit a limit) # if ($errStack eq "") { $errStack .= $_; } else { $errStack .= "~".$_; } my $errCode = &errorCodeOf($_); if ($errCode) { if ($errCodes) { $errCodes .= ", "; } $errCodes .= $errCode; } } $prev_offset = tell(LOG); } } # # Dump the final errStack. # if ($initialTimestamp && $errCodes) { &writeErrStack($initialTimestamp, $lineNumber, $errStack, $errCodes, $traceFileName); } } else { # # We know the offset is bogus, so alert log has been recycled; # offset will be computed later, but line count should be computed now. # seek(LOG, 0, 0); $line = 0; while () { $line++; } } } $offset = tell(LOG); # # Reopen scanned file and write out new offset (log size), line number, # and time # open (SCANNED, "+> $scannedFile") or die "em_error=Cannot open $scannedFile"; print SCANNED "$offset $line $initialTimestamp\n"; close(SCANNED); close(LOG); # # subroutine initializeColumnPatterns # Initialize the array of patterns that define non-generic error categories. # sub initializeColumnPatterns { $#columnPatterns = $ENV{EM_ALERTLOG_ERROR_TYPES} - 1; if (EMAGENT_isPerlDebugEnabled()) { EMAGENT_PERL_DEBUG("$ENV{EM_ALERTLOG_ERROR_TYPES} additional columns defined."); } for ($[+1..$#columnPatterns+1) { @columnPatterns[$_-1] = $ENV{"EM_ALERTLOG_ERROR_TYPE$_"}; if (!@columnPatterns[$_-1]) { EMAGENT_PERL_WARN("Column $_ contains no definition."); } else { # translate commas to regexp alternatives; drop whitespace @columnPatterns[$_-1] =~ tr/, /|/d; if (EMAGENT_isPerlDebugEnabled()) { EMAGENT_PERL_DEBUG("Column $_ matches errors @columnPatterns[$_-1]."); } } } } # # subroutine timestampValid # Check the validity of a timestamp. # # $_[0] - timestamp to validate # sub timestampValid { my $tsValid = 1; my @current = split (/\D/, $_[0]); # split out each number from timestamp my $currval = \@current; my $i = 0; my $maxval; foreach $maxval (@max) # verify each number is within valid range { if ($maxval != "") { while (ord(substr($currval->[$i], 0, 1)) < 48 && $i < scalar(@current)) { $i++; # skip null values within @current } if ($currval->[$i] > $maxval) { $tsValid = 0; # timestamp invalid...number too large } $currval->[$i] =~ s/0//; if ($currval->[$i] == "") { if (ord(substr($maxval, 0, 1)) > 48) { $tsValid = 0; # timestamp invalid...number = 0 } } $currval->[$i] = ""; } } $tsValid == 1; } # # subroutine outOfTime # Determine if we've bumped up against a time limit. # sub outOfTime { # # Check every 512 log entries to see if we've been time-limited # !($stacks_processed % 128) && $limitSwitch > 0 && Time::HiRes::time - $start_time >= $limitSwitch; } # # subroutine traceFileOf # Extract the trace file of the line. # # $_[0] - line to search for error codes in # sub traceFileOf { my $traceFilePortion; my $loc = $[; if (($loc = index($_[0], $backgroundDumpDest, $loc)) >= $[) { $traceFilePortion = substr($_[0], $loc); } elsif (($loc = index($_[0], $userDumpDest, $loc)) >= $[) { $traceFilePortion = substr($_[0], $loc); } else { $traceFilePortion = ""; } if ($traceFilePortion ne "") { if ($traceFilePortion =~ m/([^\s]*).*$/) { if (substr($traceFilePortion, length($1)-1) eq ":") { # # Remove trailing colon. # $traceFilePortion = substr($1, 0, length($1)-1); } } } $traceFilePortion; } # # subroutine errorCodeOf # Extract the error codes of the line. # # $_[0] - line to search for error codes in # sub errorCodeOf { my $error = ""; if ($_[0] =~ m/$errorPrefix-(\d*)/) { $error = $1; if ($errorPrefix eq "ORA" && ($error == 600 || $error == 7445) && $_[0] =~ m/^[^\[]*(\[[^\]]*\])/) { $error .= $1; } } $error; } # # subroutine writeErrStack # Write out the current error stack. Do nothing if we've seen this before. # # $_[0] - initialTimestamp # $_[1] - lineNumber # $_[2] - errStack # $_[3] - errCodes # $_[4] - traceFileName # sub writeErrStack { # # combine all similar error stacks # # to revert to only rolling up identical 600 and 7445 errors, # and (&&) the following condition to the first argument of grep: # # m/(600\[[^\]]*\])|(7445\[[^\]]*\])/ # my @matchedStackKeys = grep($_ eq $_[3], @errStackKeys); if ($#matchedStackKeys == ($[ - 1)) { push(@errStackKeys, $_[3]); # # Compose the output line (just in case it isn't ignored). # my $isGeneric = 1; my $outputLine = "$_[0]/$_[1]|"; # errStackTimestamp/# lineNumber my $errorColumns = "|"; if ($errorPrefix eq "CRS") # No generic column { $errorColumns = ""; } my $column = 0; my $errorPattern; while ($column < @columnPatterns) { # if we're still considering this error stack possibly generic and # an error list was supplied... if ($isGeneric && @columnPatterns[$column]) { $errorPattern = "$errorPrefix-(" . @columnPatterns[$column] . ")"; # ...attempt a match to reroute this error to a different # column. if ($_[2] =~ m/$errorPattern/) { $errorColumns .= $_[2]; $isGeneric = 0; } } $errorColumns .= "|"; $column++; } if (($errorPrefix ne "CRS") && $isGeneric) { $outputLine .= $_[2]; # generic error column } if ($errorPrefix ne "CRS") { $outputLine .= $errorColumns . # other error columns "$_[4]|$_[3]|$log_file_absolute\n"; # traceFileName, # errCodes, # fileName # # Write the line when either # - there is no pattern to ignore or # - this is not a generic error stack or # - the generic error stack does not match the ignore pattern # if (!$ignorePattern || !$isGeneric || ($_[2] !~ m/$ignorePattern/)) { # write the timestamp as the format e.g 2008-11-25 02:06:22 # put it as the last column my ($year, $month, $day, $hour, $min, $sec) = HTTP::Date::parse_date($_[0]); my $agentTimestamp = $year . "-" . $month . "-" . $day . " ". $hour . ":" . $min . ":" . $sec ; $outputLine = "em_result=" .$agentTimestamp . "|" . $outputLine; print $outputLine; EMD_PERL_DEBUG($outputLine); } } else ## for CRS alert log, no traceFileName { $outputLine .= $errorColumns . # other error columns "$_[3]|$log_file_absolute\n"; # errCodes, # fileName # # Write the line when it is not generic if (!$isGeneric) { $outputLine = "em_result=" . $outputLine; print $outputLine; EMD_PERL_DEBUG($outputLine); } } } } exit 0;