So my new way of parsing an HTML tag is a monstrosity. However, it takes care of the fact that there can be whitespace, i.e. something like <p id = " someId" >Yo</p> should have the id parsed.
boolHtmlProcessor::_processTag(std::string::const_iterator it1, const std::string::const_iterator it2, node & nd)
{
/*
[it1, it2): iterators for the range of the string
nd: node in which classes and ids of the tage are stored
Returns true or false depending on whether a problem was encountered during the processing.
*/
/* Get the element type, at the beginning of the tag: */
std::string elementType("");
while (_elementTypeChars.find(*it1) != std::string::npos && it1 != it2) elementType.push_back(*it1++);
if (elementType.empty()) return false;
nd.element_type = elementType;
/* Get any attributes: */
std::vector<std::pair<std::string, std::string>> attributes;
const std::pair<std::string, std::string> thisAttribute;
while (_hasNextAttribute(it1, it2, thisAttribute)) attributes.push_back(thisAttribute);
if (!_processAttributes(attributes, nd.class_list, nd.iden)) return false;
returntrue;
}
where the function _hasNextAttribute is
bool HtmlProcessor::_hasNextAttribute(std::string::iterator & it1, const std::string::iterator & it2, std::pair<std::string, std::string> & attrHolder)
{
    /*
       Parses the first HTML attribute in the iterator range [it1, it2) and stores it in
       attrHolder as (name, value); e.g.

           class="myClass1 myClass2" id="myId"
           ---------- _hasNextAttribute -------->
           attrHolder = (class, myClass1 myClass2)

       Spaces are allowed before the name, around the equals sign and before the
       opening quotation mark. The name may only contain characters from
       _attributeTypeChars and the value only characters from _attributeValChars.

       Returns true if an attribute was parsed; it1 then refers to the character just
       after the closing quotation mark. Returns false if the range holds only spaces
       (it1 == it2 afterwards) or if the text is not a well-formed attribute (it1 is
       left where parsing stopped). attrHolder is only written on success.

       Every loop tests it1 != it2 BEFORE dereferencing, so it2 is never read.
    */

    /* Skip through left whitespace padding */
    while (it1 != it2 && *it1 == ' ') ++it1;
    if (it1 == it2) return false; /* Nothing left to parse, hence no next attribute */

    /* Attribute name, expected after any whitespace. Must be non-empty. */
    std::string attr;
    while (it1 != it2 && _attributeTypeChars.find(*it1) != std::string::npos)
        attr.push_back(*it1++);
    if (attr.empty()) return false;

    /* Skip through whitespace padding between the attribute name and equals sign */
    while (it1 != it2 && *it1 == ' ') ++it1;
    if (it1 == it2 || *it1 != '=') return false; /* Current character should be an equals sign */
    ++it1;

    /* Skip through whitespace between the equals sign and quotation mark */
    while (it1 != it2 && *it1 == ' ') ++it1;
    if (it1 == it2 || *it1 != '"') return false; /* Current character should be a quotation mark */
    ++it1;

    /* Attribute value, expected after the first quotation mark. May be empty. */
    std::string val;
    while (it1 != it2 && _attributeValChars.find(*it1) != std::string::npos)
        val.push_back(*it1++);

    /* The value must be closed off by a quotation mark; running out of input or hitting
       a character outside _attributeValChars first is an error. */
    if (it1 == it2 || *it1 != '"') return false;
    ++it1;

    /* it1 now refers to the character after the quotation mark that closes the value */
    attrHolder = std::make_pair(attr, val);
    return true;
}
and the function _processAttributes is
bool HtmlProcessor::_processAttributes(const std::vector<std::pair<std::string, std::string>> & attrs, std::set<std::string> &classesTarget, std::string & identifierTarget)
{
    /*
       Extracts the id and the classes from a list of (name, value) attribute pairs.

       attrs:            parsed attributes of one tag
       classesTarget:    set that receives every class named by a "class" attribute
       identifierTarget: string that receives the value of an "id" attribute; if several
                         are present the last one wins

       Attribute names are matched case-insensitively; all other attributes are ignored.
       Always returns true.
    */
    for (const auto & attribute : attrs)
    {
        std::string name(attribute.first);
        /* tolower must be handed a value representable as unsigned char; passing a
           negative plain char is undefined behaviour. */
        std::transform(name.begin(), name.end(), name.begin(),
                       [](unsigned char c) { return static_cast<char>(::tolower(c)); });

        if (name == "id")
        {
            identifierTarget = attribute.second;
        }
        else if (name == "class")
        {
            /* The value of a class attribute can be several classes separated by
               whitespace; add all of them to the set of classes for the node. The
               split is done on the attribute VALUE, and operator>> is used so that
               leading or repeated spaces never produce an empty class name. */
            std::stringstream classStream(attribute.second);
            std::string thisClass;
            while (classStream >> thisClass) classesTarget.insert(thisClass);
        }
    }
    return true;
}
and the related data structures are
const std::stringHtmlProcessor::_elementTypeChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
const std::stringHtmlProcessor::_attributeTypeChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_";
const std::stringHtmlProcessor::_attributeValChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_ ";
/* One element of the document tree. _processTag fills in element_type, class_list and
   iden from the text of an opening tag. */
struct node
{
/* Element (tag) name as it appeared in the tag, e.g. "p". */
std::string element_type;
/* Classes of the element; passed to _processAttributes as the target for "class" values. */
std::set<std::string> class_list;
/* Identifier of the element; passed to _processAttributes as the target for the "id" value. */
std::string iden;
/* Child nodes. NOTE(review): raw pointers - who allocates and frees the children is not
   visible here; confirm ownership with the code that builds the tree. */
std::vector<node*> children;
};
That node structure is how I'm building my document tree.