Codebase list html2text / debian/1.3.2a-9 urlistream.C
debian/1.3.2a-9

Tree @debian/1.3.2a-9 (Download .tar.gz)

urlistream.C @debian/1.3.2a-9raw · history · blame

 /***************************************************************************/

/*
 * Portions Copyright (c) 1999 GMRS Software GmbH
 * Carl-von-Linde-Str. 38, D-85716 Unterschleissheim, http://www.gmrs.de
 * All rights reserved.
 *
 * Author: Arno Unkrig <arno@unkrig.de>
 */
 
/* This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License in the file COPYING for more details.
 */

 /***************************************************************************/

/*
 * Changes to version 1.2.2 were made by Martin Bayer <mbayer@zedat.fu-berlin.de>
 * Dates and reasons of modifications:
 * Thu Oct  4 21:49:09 CEST 2001: ported to g++ 3.0
 * Sun Apr  7 11:59:03 CEST 2002: Handle URLs with missing node
 * Mon Jul 22 13:53:02 CEST 2002: Finally made reading from STDIN work.
 */
  
 /***************************************************************************/


#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <fcntl.h>
#include <unistd.h>
/*
 * "poll()" support.
 *
 * When SYS_POLL_MISSING is defined, <sys/poll.h> is not included; instead
 * "struct pollfd", the "poll()" prototype and the event flag macros are
 * declared by hand right here. Otherwise everything comes from <sys/poll.h>.
 *
 * NOTE(review): the hand-written flag values are assumed to match the target
 * platform's "poll()" implementation -- verify before enabling
 * SYS_POLL_MISSING on a new platform.
 */
#ifdef SYS_POLL_MISSING /* { */
struct pollfd {
  int fd;           /* file descriptor */
  short events;     /* requested events */
  short revents;    /* returned events */
};
extern "C" int poll(struct pollfd *ufds, unsigned int nfds, int timeout);
#define POLLIN      0x0001    /* There is data to read */
#define POLLPRI     0x0002    /* There is urgent data to read */
#define POLLOUT     0x0004    /* Writing now will not block */
#define POLLERR     0x0008    /* Error condition */
#define POLLHUP     0x0010    /* Hung up */
#define POLLNVAL    0x0020    /* Invalid request: fd not open */
#else /* } { */
#include <sys/poll.h>
#endif /* } */
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <signal.h>

#include "urlistream.h"

using std::ios;

/* ------------------------------------------------------------------------- */

/*
 * Compile with
 *
 *     CC -DTESTING urlistream.C -o urlcat
 */

/* ------------------------------------------------------------------------- */

/*
 * Open the resource named by "url".
 *
 * Recognized forms:
 *
 *   "http:" <address>     handed to "http_open()" (without the prefix)
 *   "file:" <file-name>   handed to "file_open()" (without the prefix)
 *   <file-name>           a string without any ':' is a bare file name
 *
 * Any other URL is rejected; "open_error_" then describes the problem. If the
 * stream is already open, it is closed first.
 *
 * url:     NUL-terminated URL or file name.
 * timeout: Milliseconds; only forwarded to "http_open()".
 */
void
urlistream::open(
  const char *url,
  int        timeout /* = default_timeout */  // Milliseconds
)
{
  if (is_open()) close();

  /*
   * Forget the error of any previous "open()", so that a stale message does
   * not survive a subsequent successful open.
   */
  open_error_ = NULL;

  /*
   * "http:" <address> URL?
   *
   * Notice: "strncmp()" (not "memcmp()") is used for the prefix tests, because
   * it stops at the terminating NUL and thus never reads beyond the end of a
   * URL that is shorter than the prefix (e.g. "-").
   */
  if (!strncmp(url, "http:", 5)) {
    http_open(url + 5, timeout);
    return;
  }

  /*
   * "file:" <file-name> URL?
   */
  if (!strncmp(url, "file:", 5)) {
    file_open(url + 5);
    return;
  }

  /*
   * is the URL a bare file name?
   */
  if (strchr(url, ':') == NULL) {
    file_open(url);
    return;
  }

  open_error_ = "Unknown protocol (only \"file:\" and \"http:\" allowed)";
}

void
urlistream::open(
  const string &url,
  int          timeout /* = default_timeout */  // Milliseconds
)
{
  open(url.c_str(), timeout);
}

/* ------------------------------------------------------------------------- */

/*
 * Connect to an HTTP server and request a document.
 *
 * The address is split into host, port and node; the host is resolved (IPv4
 * only), a TCP connection is established with a non-blocking "connect()"
 * that is bounded by "timeout", and a "GET <node>" request is sent.
 *
 * On success, "fd_" is set to the connected socket, from which the server's
 * response can be read. On failure, "open_error_" is set to a description of
 * the problem, the socket (if any) is closed, and "fd_" is left untouched.
 *
 * address: The URL portion after "http:", i.e. "//host[:port][/node]".
 * timeout: Milliseconds; passed to each "poll()" that waits for the connect.
 */
void
urlistream::http_open(
  const char *address,    // The URL portion after "http:"
  int        timeout      // Milliseconds
)
{

  /*
   * Break up the HTTP address:
   *
   *   "//" <host> [ ":" <port> ] <node>
   *
   * A missing node is interpreted as node "/" - Arno
   *
   * Over-long components are silently truncated to the buffer sizes. The node
   * ends at the first '#', so a "#anchor" suffix never makes it into
   * "node_name".
   */
  char host_name[100];
  char port_name[100];
  char node_name[1000];
  {
    const char *p = address;
    char       *q;

    if (*p++ != '/' || *p++ != '/') {
      open_error_ = "HTTP address does not begin with \"//\"";
      return;
    }

    for (q = host_name; *p && *p != ':' && *p != '/'; ++p) {
      if (q < host_name + sizeof(host_name) - 1) *q++ = *p;
    }
    *q = '\0';

    if (*p == ':') {
      ++p;
      for (q = port_name; *p && *p != '/'; ++p) {
        if (q < port_name + sizeof(port_name) - 1) *q++ = *p;
      }
      *q = '\0';
    } else {
      strcpy(port_name, "80");
    }

    for (q = node_name; *p && *p != '#'; ++p) {
      if (q < node_name + sizeof(node_name) - 1) *q++ = *p;
    }
    *q = '\0';
    if (!node_name[0]) {
      strcpy(node_name, "/");
    }
  }

  /*
   * Zero the whole structure first, so that padding ("sin_zero") does not
   * contain stack garbage.
   */
  struct sockaddr_in soc_address;
  memset(&soc_address, 0, sizeof(soc_address));
  soc_address.sin_family = AF_INET;

  /*
   * Parse the host name: Either a dotted quad ("1.2.3.4"), or a name that
   * must be resolved.
   *
   * Notice: The argument of "isdigit()" must be representable as an
   * "unsigned char", hence the casts.
   */
  {
    const char *p;
    int dot_count = 0;
    for (p = host_name; *p; ++p) {
      if (*p == '.') {
        ++dot_count;
      } else if (!isdigit((unsigned char) *p)) {
        break;
      }
    }
    if (*p == '\0' && dot_count == 3) {
      soc_address.sin_addr.s_addr = inet_addr(host_name);

      /*
       * "inet_addr()" reports malformed input (e.g. "1.2.3.999") by returning
       * INADDR_NONE.
       */
      if (soc_address.sin_addr.s_addr == INADDR_NONE) {
        open_error_ = "Invalid numeric host address";
        return;
      }
    } else {
      struct hostent *h = gethostbyname(host_name);
      if (
        h == 0 ||
        h->h_addrtype != AF_INET ||
        h->h_length != sizeof(struct in_addr)
      ) {
        open_error_ = "Could not resolve host name";
        return;
      }

      /*
       * "h_length" was verified above, so this copies exactly one address.
       * "memcpy()" avoids any assumption about the alignment of "h_addr".
       */
      memcpy(&soc_address.sin_addr, h->h_addr, sizeof(soc_address.sin_addr));
    }
  }

  /*
   * Parse the port name: Either a number, or a service name. Unknown service
   * names fall back to port 80.
   */
  if (isdigit((unsigned char) port_name[0])) {
    soc_address.sin_port = htons(atoi(port_name));
  } else {
    struct servent *s = getservbyname(port_name, 0);

    /*
     * Notice: "s_port" is already in network byte order and must NOT be
     * passed through "htons()" again.
     */
    if (s) {
      soc_address.sin_port = s->s_port;
    } else {
      soc_address.sin_port = htons(80);
    }
  }

  /*
   * On-the-fly definition of "FileHandle" which closes a UNIX file descriptor
   * on destruction. This way every early "return" below closes the socket.
   */
  class FileHandle {
  public:
    FileHandle() : fd(-1) {}
    ~FileHandle() { if (fd != -1) ::close(fd); }
    operator int() { return fd; }
    int operator=(int x) { return (fd = x); }
  private:
    int fd;
  } fd;

  fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
  if (fd == -1) {
    open_error_ = strerror(errno);
    return;
  }

  /*
   * Make the socket non-blocking, so the "connect()" can be canceled. This
   * means that when we issue the "connect()" we should NOT have to wait for
   * the accept on the other end.
   */
  if (fcntl(fd, F_SETFL, O_NONBLOCK) == -1) {
    open_error_ = strerror(errno);
    return;
  }

  /*
   * Issue the "connect()". Since the server can't do an instantaneous
   * "accept()" and we are non-blocking, this will almost certainly return
   * a negative status.
   */
  if (connect(
    fd,
    (struct sockaddr *) &soc_address, sizeof(soc_address)
  ) == -1) {
    if (errno != EINPROGRESS && errno != EAGAIN) {
      open_error_ = strerror(errno);
      return;
    }

    for (;;) {
      struct pollfd p;
      int           res;

      p.fd     = fd;
      p.events = POLLOUT;
      res = poll(&p, 1, timeout);

      /*
       * Interrupted "poll()"?
       */
      if (res == -1 && errno == EINTR) continue;

      /*
       * Check for errors.
       */
      if (res == -1 && errno != EALREADY) {
        open_error_ = strerror(errno);
        return;
      }

      /*
       * Check for timeout.
       */
      if (res == 0) {
        open_error_ = "\"connect()\" timed out";
        return;
      }

      /*
       * Extra check here for connection success, if we try to
       * connect again, and get EISCONN, it means we have a
       * successful connection.
       *
       * Notice: On SINIX 5.43 B2000, the "poll()" returns "1" when the
       * timeout occurs (!?). When we call "poll()", we encounter a SIGPIPE
       * (!?). If we ignore it, "poll()" returns EINVAL (22) (!?).
       *
       * SIGPIPE is ignored only for the duration of the second "connect()";
       * the previous handler is restored right afterwards.
       */
      void (*sigpipe_handler)(int) = signal(SIGPIPE, SIG_IGN); /* { */
      res = connect(fd, (struct sockaddr *) &soc_address, sizeof(soc_address));
      (void) signal(SIGPIPE, sigpipe_handler); /* } */

      if (res == -1 && errno == EISCONN) break;
      if (res == -1 && errno != EALREADY) {
        open_error_ = strerror(errno);
        return;
      }
    }
  }

  /*
   * Make the socket blocking again on good "connect()".
   */
  if (fcntl(fd, F_SETFL, 0) == -1) {
    open_error_ = strerror(errno);
    return;
  }

  /*
   * Issue the HTTP request.
   *
   * Notice: "GET xyz" means "return the document without a header".
   *
   * The buffer is dimensioned for "GET ", the longest possible node name,
   * "\r\n\r\n" and the terminating NUL, so "sprintf()" cannot overflow it.
   */
  char command[4 + (sizeof(node_name) - 1) + 4 + 1];
  sprintf(command, "GET %s\r\n\r\n", node_name);
  ssize_t command_length = strlen(command);

  if (::write(fd, command, command_length) != command_length) {
    open_error_ = "Error sending HTTP GET request";
    return;
  }

  /*
   * Attach the file descriptor to the ifstream.
   */
  fd_ = fd;

  fd = -1;    // ...so that it is not implicitly "::close()"'d.
}

/* ------------------------------------------------------------------------- */

/*
 * Open a local file for reading.
 *
 * The file name "-" means standard input: file descriptor 0 is duplicated
 * instead of opening a file.
 *
 * On success, "fd_" is the new descriptor and "open_error_" is cleared. On
 * failure, "fd_" is -1 and "open_error_" holds the system's error message.
 */
void
urlistream::file_open(const char *file_name)
{
  fd_ = !strcmp(file_name, "-") ? ::dup(0) : ::open(file_name, O_RDONLY);

  /*
   * "errno" is only meaningful after a FAILED call; after a successful one it
   * may hold any stale value, so it must not be turned into an error message.
   */
  if (fd_ == -1) {
    open_error_ = strerror(errno);
  } else {
    open_error_ = NULL;
  }
}

/* ------------------------------------------------------------------------- */

/*
 * Return the message recorded in "open_error_", or the string "No error" if
 * none is recorded.
 */
const char *
urlistream::open_error() const
{
  if (open_error_) {
    return open_error_;
  }

  return "No error";
}

/* ------------------------------------------------------------------------- */

/*
 * Read the next byte from the underlying file descriptor.
 *
 * Returns the byte as a value in the range 0...255, or -1 on end-of-input or
 * on a read error.
 */
int
urlistream::get()
{
  /*
   * Notice: The byte is read into an "unsigned char"; with a plain (possibly
   * signed) "char", the byte 0xFF would be returned as -1 and thus be
   * mistaken for end-of-input.
   */
  unsigned char byte;
  ssize_t       count;

  /*
   * A "read()" that was interrupted by a signal did not reach the end of
   * the input; simply retry it.
   */
  do {
    count = ::read(fd_, &byte, 1);
  } while (count == -1 && errno == EINTR);

  return (count > 0 ? byte : -1);
}

#ifdef TESTING /* { */

/*
 * Test driver ("urlcat"): Opens each URL given on the command line in turn
 * and copies its contents to standard output. Terminates with exit status 1
 * if no URL is given, or as soon as one URL cannot be opened.
 */
int
main(int argc, char **argv)
{
  if (argc < 2) {
    cerr << "Usage:  urlcat <url> [ ... ]" << endl;
    exit(1);
  }

  char **const args_end = argv + argc;
  for (char **arg = argv + 1; arg < args_end; ++arg) {
    urlistream uis(*arg);
    if (!uis.is_open()) {
      cerr << "Opening \"" << *arg << "\": " << uis.open_error() << endl;
      exit(1);
    }

    /*
     * Copy byte by byte until "get()" reports the end of the input.
     */
    int c;
    while ((c = uis.get()) != EOF) {
      cout << (char) c;
    }
  }

  return 0;
}

#endif /* } */

/* ------------------------------------------------------------------------- */