// detector.c written by Mitchell Foral. mitchell<att>caladbolg.net.
// See COPYING for license information.
#include <ctype.h>
#include <magic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "detector.h"
#include "languages.h"
#include "log.h"
#include "hash/cppheader_hash.h"
#include "hash/disambiguatefunc_hash.h"
#include "hash/extension_hash.h"
#include "hash/filename_hash.h"
#define ISBINARY(x) (x[0] == '\1')
#define ISAMBIGUOUS(x) (x[0] == '\2')
#define DISAMBIGUATEWHAT(x) &x[1]
#ifdef _WIN32
# include <fcntl.h>
# define mkstemp(p) _open(_mktemp(p), _O_CREAT | _O_SHORT_LIVED | _O_EXCL)
#endif
/* Parse the output of libmagic and return a language, if any.
* The contents of string `line` will be destroyed.
*/
const char *magic_parse(char *line) {
char *p, *pe;
char *eol = line + strlen(line);
char buf[80];
size_t length;
for (p = line; p < eol; p++) *p = tolower(*p);
p = strstr(line, "script text"); // Example: "script text executable for perl -w,"
if (p && p == line) {
p = strstr(line, "for ");
if (p) {
p += 4;
pe = p;
while (isalnum(*pe)) pe++;
length = pe - p;
strncpy(buf, p, length);
buf[length] = '\0';
struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
if (rl) return(rl->name);
}
}
p = strstr(line, "script"); // Example: "PHP script, ASCII text"
if (p) {
do {
p--;
pe = p;
while (*p == ' ') p--;
while (p != line && isalnum(*(p - 1))) p--;
if (p != line && *(p - 1) == '-') p--;
} while (*p == '-'); // Skip over any switches.
length = pe - p;
strncpy(buf, p, length);
buf[length] = '\0';
struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
if (rl) return(rl->name);
} else if (strstr(line, "xml")) return(LANG_XML);
return NULL;
}
/* Use libmagic to detect file language
*/
const char *detect_language_magic(SourceFile *sourcefile) {
char line[80];
magic_t cookie = magic_open(MAGIC_NONE);
if (cookie == NULL) {
fprintf(stderr, "libmagic: %s\n", magic_error(cookie));
exit(1);
}
if (magic_load(cookie, NULL) != 0) {
fprintf(stderr, "libmagic: %s\n", magic_error(cookie));
magic_close(cookie);
exit(1);
}
if (sourcefile->diskpath) {
const char *magic = magic_file(cookie, sourcefile->diskpath);
if (magic == NULL) {
fprintf(stderr, "libmagic: %s\n", magic_error(cookie));
magic_close(cookie);
exit(1);
}
strncpy(line, magic, sizeof(line));
line[sizeof(line)-1] = '\0';
} else {
char *p = ohcount_sourcefile_get_contents(sourcefile);
if (!p) return NULL;
const char *magic = magic_buffer(cookie, p, strlen(p));
if (magic == NULL) {
fprintf(stderr, "libmagic: %s\n", magic_error(cookie));
magic_close(cookie);
exit(1);
}
strncpy(line, magic, sizeof(line));
line[sizeof(line)-1] = '\0';
}
magic_close(cookie);
return magic_parse(line);
}
/* Use all available means to detect file language
*/
const char *ohcount_detect_language(SourceFile *sourcefile) {
const char *language = NULL;
char *p, *pe;
int length;
// Attempt to detect based on file extension.
length = strlen(sourcefile->ext);
struct ExtensionMap *re = ohcount_hash_language_from_ext(sourcefile->ext,
length);
if (re) language = re->value;
if (!language) {
// Try the lower-case version of this extension.
char lowerext[length + 1];
strncpy(lowerext, sourcefile->ext, length);
lowerext[length] = '\0';
for (p = lowerext; p < lowerext + length; p++) *p = tolower(*p);
struct ExtensionMap *re = ohcount_hash_language_from_ext(lowerext, length);
if (re) language = re->value;
}
// Attempt to detect using Emacs mode line (/^-\*-\s*mode[\s:]*\w/i).
if(!language) {
char line[81] = { '\0' }, buf[81];
p = ohcount_sourcefile_get_contents(sourcefile);
pe = p;
char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
while (pe < eof) {
// Get the contents of the first line.
while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
strncpy(line, p, length);
line[length] = '\0';
if (*line == '#' && *(line + 1) == '!') {
// First line was sh-bang; loop to get contents of second line.
while (*pe == '\r' || *pe == '\n') pe++;
p = pe;
} else break;
}
p = strstr(line, "-*-");
if (p) {
p += 3;
while (*p == ' ' || *p == '\t') p++;
// detect "mode" (any capitalization)
if (strncasecmp(p, "mode", 4) == 0) {
p += 4;
while (*p == ' ' || *p == '\t' || *p == ':') p++;
}
pe = p;
while (!isspace(*pe) && *pe != ';' && pe != strstr(pe, "-*-")) pe++;
length = (pe - p <= sizeof(buf)) ? pe - p : sizeof(buf);
strncpy(buf, p, length);
buf[length] = '\0';
// Special case for "c" or "C" emacs mode header: always means C, not C++
if (strcasecmp(buf, "c") == 0) {
return LANG_C;
}
// First try it with the language name.
struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
if (rl) language = rl->name;
if(!language) {
// Then try it with the extension table.
struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
if (re) language = re->value;
}
if (!language) {
// Try the lower-case version of this modeline.
for (pe = buf; pe < buf+length; pe++) *pe = tolower(*pe);
// First try it with the language name.
rl = ohcount_hash_language_from_name(buf, length);
if (rl) language = rl->name;
}
if (!language) {
// Then try it with the extension table.
struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
if (re) language = re->value;
}
}
}
// Attempt to detect based on filename.
if(!language) {
length = strlen(sourcefile->filename);
struct FilenameMap *rf =
ohcount_hash_language_from_filename(sourcefile->filename, length);
if (rf) language = rf->value;
}
// Attempt to detect based on Unix 'file' command.
if(!language) {
language = detect_language_magic(sourcefile);
}
if (language) {
if (ISAMBIGUOUS(language)) {
// Call the appropriate function for disambiguation.
length = strlen(DISAMBIGUATEWHAT(language));
struct DisambiguateFuncsMap *rd =
ohcount_hash_disambiguate_func_from_id(DISAMBIGUATEWHAT(language),
length);
if (rd) language = rd->value(sourcefile);
} else language = ISBINARY(language) ? NULL : language;
}
return language;
}
const char *disambiguate_aspx(SourceFile *sourcefile) {
char *p = ohcount_sourcefile_get_contents(sourcefile);
char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
for (; p < eof; p++) {
// /<%@\s*Page[^>]+Language="VB"[^>]+%>/
p = strstr(p, "<%@");
if (!p)
break;
char *pe = strstr(p, "%>");
if (p && pe) {
p += 3;
const int length = pe - p;
char buf[length];
strncpy(buf, p, length);
buf[length] = '\0';
char *eol = buf + strlen(buf);
for (p = buf; p < eol; p++) *p = tolower(*p);
p = buf;
while (*p == ' ' || *p == '\t') p++;
if (strncmp(p, "page", 4) == 0) {
p += 4;
if (strstr(p, "language=\"vb\""))
return LANG_VB_ASPX;
}
}
}
return LANG_CS_ASPX;
}
// 6502 assembly or XML-based Advanced Stream Redirector ?
const char *disambiguate_asx(SourceFile *sourcefile) {
char *p = ohcount_sourcefile_get_contents(sourcefile);
char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
for (; p < eof; p++) {
switch (*p) {
case ' ':
case '\t':
case '\n':
case '\r':
break;
case '<':
case '\0':
// byte-order marks:
case (char) 0xef:
case (char) 0xfe:
case (char) 0xff:
return NULL; // XML
default:
return LANG_ASSEMBLER;
}
}
return LANG_ASSEMBLER; // only blanks - not valid XML, may be valid asm
}
const char *disambiguate_b(SourceFile *sourcefile) {
char *p = ohcount_sourcefile_get_contents(sourcefile);
char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
while (p < eof) {
// /(implement[ \t])|(include[ \t]+"[^"]*";)|
// ((return|break|continue).*;|(pick|case).*\{)/
if (strncmp(p, "implement", 9) == 0 &&
(*(p + 9) == ' ' || *(p + 9) == '\t'))
return LANG_LIMBO;
else if (strncmp(p, "include", 7) == 0 &&
(*(p + 7) == ' ' || *(p + 7) == '\t')) {
p += 7;
while (*p == ' ' || *p == '\t') p++;
if (*p == '"') {
while (*p != '"' && p < eof) p++;
if (*p == '"' && *(p + 1) == ';')
return LANG_LIMBO;
}
} else if (strncmp(p, "return", 6) == 0 ||
strncmp(p, "break", 5) == 0 ||
strncmp(p, "continue", 8) == 0) {
if (strstr(p, ";"))
return LANG_LIMBO;
} else if (strncmp(p, "pick", 4) == 0 ||
strncmp(p, "case", 4) == 0) {
if (strstr(p, "{"))
return LANG_LIMBO;
}
p++;
}
return disambiguate_basic(sourcefile);
}
const char *disambiguate_basic(SourceFile *sourcefile) {
char *p, *pe;
int length;
// Attempt to detect based on file contents.
char line[81];
p = ohcount_sourcefile_get_contents(sourcefile);
pe = p;
char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
while (pe < eof) {
// Get a line at a time.
while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
strncpy(line, p, length);
line[length] = '\0';
char *line_end = pe;
p = line;
if (isdigit(*p)) {
// /^\d+\s+\w/
p++;
while (isdigit(*p)) p++;
if (*p == ' ' || *p == '\t') {
p++;
while (*p == ' ' || *p == '\t') p++;
if (isalnum(*p))
return LANG_CLASSIC_BASIC;
}
}
// Next line.
pe = line_end;
while (*pe == '\r' || *pe == '\n') pe++;
p = pe;
}
// Attempt to detect from associated VB files in file context.
char **filenames = sourcefile->filenames;
if (filenames) {
int i;
for (i = 0; filenames[i] != NULL; i++) {
pe = filenames[i] + strlen(filenames[i]);
p = pe;
while (p > filenames[i] && *(p - 1) != '.') p--;
length = pe - p;
if (length == 3 &&
(strncmp(p, "frm", length) == 0 ||
strncmp(p, "frx", length) == 0 ||
strncmp(p, "vba", length) == 0 ||
strncmp(p, "vbp", length) == 0 ||
strncmp(p, "vbs", length) == 0)) {
return LANG_VISUALBASIC;
}
}
}
return LANG_STRUCTURED_BASIC;
}
const char *disambiguate_cs(SourceFile *sourcefile) {
// Attempt to detect based on file contents.
char *contents = ohcount_sourcefile_get_contents(sourcefile);
if (contents && strstr(contents, "<?cs"))
return LANG_CLEARSILVER_TEMPLATE;
else
return LANG_CSHARP;
}
const char *disambiguate_dat(SourceFile *sourcefile) {
char *p = ohcount_sourcefile_get_contents(sourcefile);
char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
for (; p < eof; p++) {
switch (*p) {
case ' ':
case '\t':
case '\n':
case '\r':
break;
case '/':
if (p[1] == '*') // AMPL comment
return LANG_AMPL;
return NULL;
case '#':
return LANG_AMPL; // AMPL comment
case 'd':
if (strncmp(p, "data", 4) == 0) // AMPL data statement
return LANG_AMPL;
return BINARY;
case 'p':
if (strncmp(p, "param", 5) == 0) // AMPL param statement
return LANG_AMPL;
return BINARY;
case 's':
if (strncmp(p, "set", 3) == 0) // AMPL set statement
return LANG_AMPL;
return BINARY;
default:
return BINARY;
}
}
return NULL; // only blanks
}
const char *disambiguate_def(SourceFile *sourcefile) {
char *p = ohcount_sourcefile_get_contents(sourcefile);
char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
for (; p < eof; p++) {
switch (*p) {
case ' ':
case '\t':
case '\n':
case '\r':
break;
case '(':
if (p[1] == '*') // Modula-2 comment
return LANG_MODULA2;
return NULL;
case 'D':
if (strncmp(p, "DEFINITION", 10) == 0) // Modula-2 "DEFINITION MODULE"
return LANG_MODULA2;
return NULL;
default:
return NULL; // not Modula-2
}
}
return NULL; // only blanks
}
const char *disambiguate_fs(SourceFile *sourcefile) {
/* .fs could be Forth or F# */
char *contents = ohcount_sourcefile_get_contents(sourcefile);
if (contents == NULL)
return NULL;
char *p = contents;
char c = *p;
long forthcount=0;
long fsharpcount=0;
while (c != '\0') {
while (c == ' ' || c == '\t')
c = *++p;
if (strncmp(p,"\\ ",2)==0) forthcount++;
else if (strncmp(p,": ",2)==0) forthcount++;
else if (strncmp(p,"|",1)==0) fsharpcount++;
else if (strncmp(p,"let ",4)==0) fsharpcount++;
else if (strncmp(p,"type ",5)==0) fsharpcount++;
else if (strncmp(p,"//",2)==0) fsharpcount++;
while (c != '\0' && c != '\n' && c != '\r')
c = *++p;
while (c == '\n' || c == '\r')
c = *++p;
}
if (forthcount > fsharpcount)
return LANG_FORTH;
else if (forthcount < fsharpcount)
return LANG_FSHARP;
else
return NULL;
}
const char *disambiguate_fortran(SourceFile *sourcefile) {
char *p;
p = ohcount_sourcefile_get_contents(sourcefile);
char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
// Try the assumption of a fixed formatted source code, and return free
// format if anything opposes this assumption.
// Rules based on the Fortran standard, page 47:
// ftp://ftp.nag.co.uk/sc22wg5/N1801-N1850/N1830.pdf
while (p < eof) {
int i = 1;
int blanklabel;
// Process a single line; tabulators are not valid in Fortran code
// but some compilers accept them to skip the first 5 columns.
if (*p == ' ' || *p == '\t' || isdigit(*p)) {
// Only consider lines starting with a blank or digit
// (non-comment in fixed)
if (*p == '\t') i = 5;
blanklabel = (*p == ' ' || *p == '\t');
while (*p != '\r' && *p != '\n' && p < eof) {
p++; i++;
if (i <= 5) {
blanklabel = blanklabel && (*p == ' ');
if ( !isdigit(*p) && *p != ' ' && *p != '!')
// Non-digit, non-blank, non-comment character in the label field
// definetly not valid fixed formatted code!
return LANG_FORTRANFREE;
}
if ((i == 6) && !blanklabel && *p != ' ' && *p != '0')
// Fixed format continuation line with non-blank label field
// not allowed, assume free format:
return LANG_FORTRANFREE;
// Ignore comments (a ! character in column 6 is a continuation in
// fixed form)
if (*p == '!' && i != 6) {
while (*p != '\r' && *p != '\n' && p < eof) p++;
} else {
// Ignore quotes
if (*p == '"') {
if (p < eof) {p++; i++;}
while (*p != '"' && *p != '\r' && *p != '\n' && p < eof) {
p++; i++;
}
}
if (*p == '\'') {
if (p < eof) {p++; i++;}
while (*p != '\'' && *p != '\r' && *p != '\n' && p < eof) {
p++; i++;
}
}
// Check for free format line continuation
if (i > 6 && i <= 72 && *p == '&')
// Found an unquoted free format continuation character in the fixed
// format code section. This has to be free format.
return LANG_FORTRANFREE;
}
}
} else {
// Not a statement line in fixed format...
if (*p != 'C' && *p != 'c' && *p != '*' && *p != '!')
// Not a valid fixed form comment, has to be free formatted source
return LANG_FORTRANFREE;
// Comment in fixed form, ignore this line
while (*p != '\r' && *p != '\n' && p < eof) p++;
}
// Skip all line ends
while ((*p == '\r' || *p == '\n') && p < eof) p++;
}
// Assume fixed format if none of the lines broke the assumptions
return LANG_FORTRANFIXED;
}
const char *disambiguate_h(SourceFile *sourcefile) {
char *p, *pe, *bof;
int length;
// If the directory contains a matching *.m file, likely Objective C.
length = strlen(sourcefile->filename);
if (strcmp(sourcefile->ext, "h") == 0) {
char path[length];
strncpy(path, sourcefile->filename, length);
path[length] = '\0';
*(path + length - 1) = 'm';
char **filenames = sourcefile->filenames;
if (filenames) {
int i;
for (i = 0; filenames[i] != NULL; i++)
if (strcmp(path, filenames[i]) == 0)
return LANG_OBJECTIVE_C;
}
}
// Attempt to detect based on file contents.
char line[81], buf[81];
bof = ohcount_sourcefile_get_contents(sourcefile);
p = bof;
pe = p;
char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
while (pe < eof) {
// Get a line at a time.
while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
strncpy(line, p, length);
line[length] = '\0';
char *eol = line + strlen(line);
char *line_end = pe;
// Look for C++ headers.
if (*line == '#') {
p = line + 1;
while (*p == ' ' || *p == '\t') p++;
if (strncmp(p, "include", 7) == 0 &&
(*(p + 7) == ' ' || *(p + 7) == '\t')) {
// /^#\s*include\s+[<"][^>"]+[>"]/
p += 8;
while (*p == ' ' || *p == '\t') p++;
if (*p == '<' || *p == '"') {
// Is the header file a C++ header file?
p++;
pe = p;
while (pe < eol && *pe != '>' && *pe != '"') pe++;
length = pe - p;
strncpy(buf, p, length);
buf[length] = '\0';
if (ohcount_hash_is_cppheader(buf, length))
return LANG_CPP;
// Is the extension for the header file a C++ file?
p = pe;
while (p > line && *(p - 1) != '.') p--;
length = pe - p;
strncpy(buf, p, length);
buf[length] = '\0';
struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
if (re && strcmp(re->value, LANG_CPP) == 0)
return LANG_CPP;
}
}
}
// Look for C++ keywords.
p = line;
while (p < eol) {
if (islower(*p) && p != bof && !isalnum(*(p - 1)) && *(p - 1) != '_') {
pe = p;
while (islower(*pe)) pe++;
if (!isalnum(*pe) && *pe != '_') {
length = pe - p;
strncpy(buf, p, length);
buf[length] = '\0';
if (strcmp(buf, "class") == 0 ||
strcmp(buf, "namespace") == 0 ||
strcmp(buf, "template") == 0 ||
strcmp(buf, "typename") == 0)
return LANG_CPP;
}
p = pe + 1;
} else p++;
}
// Next line.
pe = line_end;
while (*pe == '\r' || *pe == '\n') pe++;
p = pe;
}
// Nothing to suggest C++.
return LANG_C;
}
const char *disambiguate_in(SourceFile *sourcefile) {
char *p, *pe;
int length;
const char *language = NULL;
p = sourcefile->filepath;
pe = p + strlen(p) - 3;
if (strstr(p, ".") <= pe) {
// Only if the filename has an extension prior to the .in
length = pe - p;
char buf[length];
strncpy(buf, p, length);
buf[length] = '\0';
p = ohcount_sourcefile_get_contents(sourcefile);
if (!p) {
return NULL;
}
// A SourceFile's filepath and diskpath need not be the same.
// Here, we'll take advantage of this to set up a new SourceFile
// whose filepath does not have the *.in extension, but whose
// diskpath still points back to the original file on disk (if any).
SourceFile *undecorated = ohcount_sourcefile_new(buf);
if (sourcefile->diskpath) {
ohcount_sourcefile_set_diskpath(undecorated, sourcefile->diskpath);
}
ohcount_sourcefile_set_contents(undecorated, p);
undecorated->filenames = sourcefile->filenames;
language = ohcount_sourcefile_get_language(undecorated);
ohcount_sourcefile_free(undecorated);
}
return language;
}
const char *disambiguate_inc(SourceFile *sourcefile) {
char *p = ohcount_sourcefile_get_contents(sourcefile);
if (p) {
char *eof = p + strlen(p);
while (p < eof) {
if (*p == '\0')
return BINARY;
else if (*p == '?' && strncmp(p + 1, "php", 3) == 0)
return LANG_PHP;
p++;
}
}
return NULL;
}
const char *disambiguate_m(SourceFile *sourcefile) {
char *p, *pe;
int length;
// Attempt to detect based on a weighted heuristic of file contents.
int mathematica_score = 0;
int matlab_score = 0;
int objective_c_score = 0;
int limbo_score = 0;
int octave_syntax_detected = 0;
int i, has_h_headers = 0, has_c_files = 0;
char **filenames = sourcefile->filenames;
if (filenames) {
for (i = 0; filenames[i] != NULL; i++) {
p = filenames[i];
pe = p + strlen(p);
if (pe - p >= 4) {
if (*(pe - 4) == '.' && *(pe - 3) == 'c' &&
((*(pe - 2) == 'p' && *(pe - 1) == 'p') ||
(*(pe - 2) == '+' && *(pe - 1) == '+') ||
(*(pe - 2) == 'x' && *(pe - 1) == 'x'))) {
has_c_files = 1;
break; // short circuit
}
} else if (pe - p >= 3) {
if (*(pe - 3) == '.' && *(pe - 2) == 'c' && *(pe - 1) == 'c') {
has_c_files = 1;
break; // short circuit
}
} else if (pe - p >= 2) {
if (*(pe - 2) == '.') {
if (*(pe - 1) == 'h')
has_h_headers = 1;
else if (*(pe - 1) == 'c' || *(pe - 1) == 'C') {
has_c_files = 1;
break; // short circuit
}
}
}
}
}
if (has_h_headers && !has_c_files)
objective_c_score += 5;
char line[81], buf[81];
p = ohcount_sourcefile_get_contents(sourcefile);
pe = p;
char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
while (pe < eof) {
// Get a line at a time.
while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
strncpy(line, p, length);
line[length] = '\0';
char *eol = line + strlen(line);
char *line_end = pe;
// Look for tell-tale lines.
p = line;
while (*p == ' ' || *p == '\t') p++;
if (*p == '%') { // Matlab comment
matlab_score++;
} else if (*p == '#' && strncmp(p, "#import", 7) == 0) { // Objective C
objective_c_score++;
} else if (*p == '#') { // Limbo or Octave comment
while (*p == '#') p++;
if (*p == ' ' || *p == '\t') {
limbo_score++;
matlab_score++;
octave_syntax_detected = 1;
}
} else if (*p == '/' && *(p + 1) == '/' || *p == '/' && *(p + 1) == '*') {
objective_c_score++; // Objective C comment
} else if (*p == '+' || *p == '-') { // Objective C method signature
objective_c_score++;
} else if (*p == '@' || *p == '#') { // Objective C method signature
if (strncmp(p, "@implementation", 15) == 0 ||
strncmp(p, "@interface", 10) == 0)
objective_c_score++;
} else if (strncmp(p, "function", 8) == 0) { // Matlab or Octave function
p += 8;
while (*p == ' ' || *p == '\t') p++;
if (*p == '(')
matlab_score++;
} else if (strncmp(p, "include", 7) == 0) { // Limbo include
// /^include[ \t]+"[^"]+\.m";/
p += 7;
if (*p == ' ' || *p == '\t') {
while (*p == ' ' || *p == '\t') p++;
if (*p == '"') {
while (*p != '"' && p < eol) p++;
if (*p == '"' && *(p - 2) == '.' && *(p - 1) == 'm')
limbo_score++;
}
}
}
// Look for Octave keywords.
p = line;
while (p < eol) {
if (islower(*p) && p != line && !isalnum(*(p - 1))) {
pe = p;
while (islower(*pe) || *pe == '_') pe++;
if (!isalnum(*pe)) {
length = pe - p;
strncpy(buf, p, length);
buf[length] = '\0';
if (strcmp(buf, "end_try_catch") == 0 ||
strcmp(buf, "end_unwind_protect") == 0 ||
strcmp(buf, "endfunction") == 0 ||
strcmp(buf, "endwhile") == 0)
octave_syntax_detected = 1;
}
p = pe + 1;
} else p++;
}
// Look for Limbo declarations
p = line;
while (p < eol) {
if (*p == ':' && (*(p + 1) == ' ' || *(p + 1) == '\t')) {
// /:[ \t]+(module|adt|fn ?\(|con[ \t])/
p += 2;
if (strncmp(p, "module", 6) == 0 && !isalnum(*(p + 6)) ||
strncmp(p, "adt", 3) == 0 && !isalnum(*(p + 3)) ||
strncmp(p, "fn", 2) == 0 &&
(*(p + 2) == ' ' && *(p + 3) == '(' || *(p + 2) == '(') ||
strncmp(p, "con", 3) == 0 &&
(*(p + 3) == ' ' || *(p + 3) == '\t'))
limbo_score++;
} else p++;
}
// Look for Mathematica pattern definitions
p = line;
while (p < eol) {
// & as postfix operator
if (*p == '&') {
p++;
while (*p == ' ' || *p == '\t') p++;
if (*p == ',' || *p == ')' || *p == ']') mathematica_score++;
}
// Mathematica comment
if (*p == '(' && *(p + 1) == '*') mathematica_score++;
// some Mathematica operators
if (*p == '/' && *(p + 1) == '.') mathematica_score++;
if (*p == '_' && *(p + 1) == '_') mathematica_score++;
if (*p == '@' && *(p + 1) == '@') mathematica_score++;
p++;
}
// Next line.
pe = line_end;
while (*pe == '\r' || *pe == '\n') pe++;
p = pe;
}
if (limbo_score > objective_c_score &&
limbo_score > matlab_score &&
limbo_score > mathematica_score)
return LANG_LIMBO;
else if (objective_c_score > matlab_score &&
objective_c_score > mathematica_score)
return LANG_OBJECTIVE_C;
else if (mathematica_score > matlab_score)
return LANG_MATHEMATICA;
else
return octave_syntax_detected ? LANG_OCTAVE : LANG_MATLAB;
}
const char *disambiguate_mod(SourceFile *sourcefile) {
char *p = ohcount_sourcefile_get_contents(sourcefile);
char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
for (; p < eof; p++) {
switch (*p) {
case ' ':
case '\t':
case '\n':
case '\r':
break;
case '(':
if (p[1] == '*') // Modula-2 comment
return LANG_MODULA2;
return NULL;
case 'I':
if (strncmp(p, "IMPLEMENTATION", 14) == 0) // Modula-2 "IMPLEMENTATION MODULE"
return LANG_MODULA2;
return NULL;
case 'M':
if (strncmp(p, "MODULE", 6) == 0) // Modula-2 "MODULE"
return LANG_MODULA2;
return NULL;
default:
return LANG_AMPL;
}
}
return NULL; // only blanks
}
#include <pcre.h>
// strnlen is not available on OS X, so we roll our own
size_t mystrnlen(const char *begin, size_t maxlen) {
if (begin == NULL)
return 0;
const char *end = memchr(begin, '\0', maxlen);
return end ? (end - begin) : maxlen;
}
const char *disambiguate_pp(SourceFile *sourcefile) {
char *p = ohcount_sourcefile_get_contents(sourcefile);
if (!p)
return NULL;
/* prepare regular expressions */
const char *error;
int erroffset;
/* try harder with optional spaces */
pcre *keyword;
keyword = pcre_compile("^\\s*(ensure|content|notify|require|source)\\s+=>",
PCRE_MULTILINE, &error, &erroffset, NULL);
if (pcre_exec(keyword, NULL, p, mystrnlen(p, 10000), 0, 0, NULL, 0) > -1)
return LANG_PUPPET;
/* check for standard puppet constructs */
pcre *construct;
construct = pcre_compile("^\\s*(define\\s+[\\w:-]+\\s*\\(|class\\s+[\\w:-]+(\\s+inherits\\s+[\\w:-]+)?\\s*[\\({]|node\\s+\\'?[\\w:\\.-]+\\'?\\s*{|import\\s+\"|include\\s+[\"']?[\\w:-][\"']?)",
PCRE_MULTILINE, &error, &erroffset, NULL);
if (pcre_exec(construct, NULL, p, mystrnlen(p, 10000), 0, 0, NULL, 0) > -1)
return LANG_PUPPET;
return LANG_PASCAL;
}
const char *disambiguate_pl(SourceFile *sourcefile) {
char *contents = ohcount_sourcefile_get_contents(sourcefile);
if (!contents)
return NULL;
// Check for a perl shebang on first line of file
const char *error;
int erroffset;
pcre *re = pcre_compile("#![^\\n]*perl", PCRE_CASELESS, &error, &erroffset, NULL);
if (pcre_exec(re, NULL, contents, mystrnlen(contents, 100), 0, PCRE_ANCHORED, NULL, 0) > -1)
return LANG_PERL;
// Check for prolog :- rules
if (strstr(contents, ":- ") || strstr(contents, ":-\n"))
return LANG_PROLOG;
// Perl by default.
return LANG_PERL;
}
#define QMAKE_SOURCES_SPACE "SOURCES +="
#define QMAKE_SOURCES "SOURCES+="
#define QMAKE_CONFIG_SPACE "CONFIG +="
#define QMAKE_CONFIG "CONFIG+="
const char *disambiguate_pro(SourceFile *sourcefile) {
char *p = ohcount_sourcefile_get_contents(sourcefile);
char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
for (; p < eof; p++) {
if (strncmp(p, QMAKE_SOURCES_SPACE, strlen(QMAKE_SOURCES_SPACE)) == 0 ||
strncmp(p, QMAKE_SOURCES, strlen(QMAKE_SOURCES)) == 0 ||
strncmp(p, QMAKE_CONFIG_SPACE, strlen(QMAKE_CONFIG_SPACE)) == 0 ||
strncmp(p, QMAKE_CONFIG, strlen(QMAKE_CONFIG)) == 0)
return LANG_MAKE; // really QMAKE
}
return LANG_IDL_PVWAVE;
}
const char *disambiguate_r(SourceFile *sourcefile) {
char *contents = ohcount_sourcefile_get_contents(sourcefile);
if (!contents)
return LANG_R;
char *eof = contents + ohcount_sourcefile_get_contents_size(sourcefile);
// Detect REBOL by looking for the occurence of "rebol" in the contents
// (case-insensitive). Correct REBOL scripts have a "REBOL [...]" header
// block.
char *needle = "rebol";
int len = strlen(needle);
for (; contents < eof - len; ++contents)
if (tolower(*contents) == *needle &&
!strncasecmp(contents, needle, len))
return LANG_REBOL;
return LANG_R;
}
const char *disambiguate_rs(SourceFile *sourcefile) {
// .rs is normally Rust, but it might be RenderScript. RenderScript is
// expected to have a "#pragma version(1)" line at the very start, possibly
// after comments. To help with supporting future versions of RenderScript,
// we'll skip the number part.
// As RenderScript is not implemented in ohcount yet, it's returned as NULL.
char *contents = ohcount_sourcefile_get_contents(sourcefile);
if (!contents) {
return LANG_RUST;
}
char *needle = "\n#pragma version";
int len = strlen(needle);
if (strncasecmp(contents, needle + 1, len - 1) == 0) {
// "#pragma version" at the very start of the file is RenderScript.
return NULL;
}
char *eof = contents + ohcount_sourcefile_get_contents_size(sourcefile);
for (; contents < eof - len; ++contents) {
if (!strncmp(contents, needle, len)) {
return NULL;
}
}
return LANG_RUST;
}
const char *disambiguate_st(SourceFile *sourcefile) {
char *p, *pe;
int length;
// Attempt to detect based on file contents.
int found_assignment = 0, found_block_start = 0, found_block_end = 0;
char line[81];
p = ohcount_sourcefile_get_contents(sourcefile);
pe = p;
char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
while (pe < eof) {
// Get a line at a time.
while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
strncpy(line, p, length);
line[length] = '\0';
char *eol = line + strlen(line);
char *line_end = pe;
for (p = line; p < eol; p++) {
if (*p == ':') {
p++;
while (p < eol && (*p == ' ' || *p == '\t')) p++;
if (*p == '=')
found_assignment = 1;
else if (*p == '[')
found_block_start = 1;
} else if (*p == ']' && *(p + 1) == '.') found_block_end = 1;
if (found_assignment && found_block_start && found_block_end)
return LANG_SMALLTALK;
}
// Next line.
pe = line_end;
while (*pe == '\r' || *pe == '\n') pe++;
p = pe;
}
return NULL;
}
int ohcount_is_binary_filename(const char *filename) {
char *p = (char *)filename + strlen(filename);
while (p > filename && *(p - 1) != '.') p--;
if (p > filename) {
struct ExtensionMap *re;
int length = strlen(p);
re = ohcount_hash_language_from_ext(p, length);
if (re) return ISBINARY(re->value);
// Try the lower-case version of this extension.
char lowerext[length];
strncpy(lowerext, p, length);
lowerext[length] = '\0';
for (p = lowerext; p < lowerext + length; p++) *p = tolower(*p);
re = ohcount_hash_language_from_ext(lowerext, length);
if (re) return ISBINARY(re->value);
}
return 0;
}