1
0
mirror of https://github.com/Rogiel/httpchannel synced 2025-12-06 07:32:50 +00:00

Implements a new, more clean and robust HTML parser

This commit is contained in:
2012-05-06 16:04:56 -03:00
parent 4a9da1708e
commit 0386906356
33 changed files with 1619 additions and 981 deletions

View File

@@ -22,10 +22,9 @@ import java.io.IOException;
import java.net.URI;
import java.util.regex.Pattern;
import com.rogiel.httpchannel.captcha.ImageCaptcha;
import com.rogiel.httpchannel.http.HttpContext;
import com.rogiel.httpchannel.util.PatternUtils;
import com.rogiel.httpchannel.util.htmlparser.HTMLPage;
import com.rogiel.httpchannel.util.html.Page;
/**
* This class provides utility methods to extract an {@link ImageCaptcha} from
@@ -57,8 +56,8 @@ public class ReCaptchaExtractor {
* the {@link HttpContext}
* @return the {@link ImageCaptcha} embedded at the given <code>page</code>
*/
public static ImageCaptcha extractCaptcha(HTMLPage page, HttpContext ctx) {
final String uri = page.findScriptSrc(CAPTCHA_URI_PATTERN);
public static ImageCaptcha extractCaptcha(Page page, HttpContext ctx) {
final String uri = page.scriptBySource(CAPTCHA_URI_PATTERN).asString();
if (uri == null)
return null;
try {
@@ -77,8 +76,8 @@ public class ReCaptchaExtractor {
* the {@link HttpContext}
* @return the {@link ImageCaptcha} contained at the given <code>page</code>
*/
public static ImageCaptcha extractAjaxCaptcha(HTMLPage page, HttpContext ctx) {
final String siteID = page.findScript(CAPTCHA_ID_PATTERN, 1);
public static ImageCaptcha extractAjaxCaptcha(Page page, HttpContext ctx) {
final String siteID = page.script(CAPTCHA_ID_PATTERN).asString(1);
try {
return doExtract(ctx.get(CHALLENGE_BASE_URI + siteID).asString());
} catch (IOException e) {

View File

@@ -33,7 +33,7 @@ import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import com.rogiel.httpchannel.util.HttpClientUtils;
import com.rogiel.httpchannel.util.htmlparser.HTMLPage;
import com.rogiel.httpchannel.util.html.Page;
public abstract class Request {
private static final JSONParser jsonParser = new JSONParser();
@@ -90,14 +90,14 @@ public abstract class Request {
});
}
public HTMLPage asPage() throws ClientProtocolException, IOException {
return HTMLPage.parse(asString());
public Page asPage() throws ClientProtocolException, IOException {
return Page.parse(asString());
}
public Future<HTMLPage> asPageAsync() throws IOException {
return ctx.threadPool.submit(new Callable<HTMLPage>() {
public Future<Page> asPageAsync() throws IOException {
return ctx.threadPool.submit(new Callable<Page>() {
@Override
public HTMLPage call() throws Exception {
public Page call() throws Exception {
return asPage();
}
});

View File

@@ -31,7 +31,7 @@ import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import com.rogiel.httpchannel.util.htmlparser.HTMLPage;
import com.rogiel.httpchannel.util.html.Page;
public class HttpClientUtils {
private static final ExecutorService threadPool = Executors
@@ -82,7 +82,7 @@ public class HttpClientUtils {
}
}
public static HTMLPage toPage(HttpResponse response) throws IOException {
return HTMLPage.parse(toString(response));
public static Page toPage(HttpResponse response) throws IOException {
return Page.parse(toString(response));
}
}

View File

@@ -0,0 +1,174 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.html;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.Tag;
/**
 * A {@link PageElement} that has a regular expression {@link Matcher}
 * attached to it, giving access to the matched string and its groups.
 * 
 * @author <a href="http://www.rogiel.com">Rogiel</a>
 */
public class MatchedElement<T extends Tag> extends PageElement<T> {
    /**
     * The regular expression {@link Matcher} that retains the matched strings
     */
    private final Matcher matcher;

    /**
     * Creates a new instance around an already existing matcher. No match
     * operation is attempted by this constructor.
     * 
     * @param tag
     *            the tag
     * @param matcher
     *            the matcher
     */
    public MatchedElement(T tag, Matcher matcher) {
        super(tag);
        this.matcher = matcher;
    }

    /**
     * Creates a new instance whose matcher applies <code>pattern</code> to
     * <code>content</code>. No match operation is attempted by this
     * constructor.
     * 
     * @param tag
     *            the tag
     * @param pattern
     *            the pattern
     * @param content
     *            the content
     */
    public MatchedElement(T tag, Pattern pattern, String content) {
        super(tag);
        this.matcher = pattern.matcher(content);
    }

    /**
     * Creates a new instance that matches <code>content</code> against
     * itself (as a quoted literal), so group 0 is the whole content.
     * 
     * @param tag
     *            the tag
     * @param content
     *            the content
     */
    public MatchedElement(T tag, String content) {
        this(tag, Pattern.compile(Pattern.quote(content)), content);
        // perform the match right away so that asString() is usable
        this.matcher.matches();
    }

    /**
     * @return <code>true</code> if the entire value matches the pattern
     */
    public boolean matches() {
        matcher.reset();
        return matcher.matches();
    }

    /**
     * @return <code>true</code> if the element has an matched element (the
     *         entire value matches the pattern)
     */
    public boolean matchesEntirelly() {
        // Matcher.lookingAt() only requires a matching prefix; the documented
        // contract is a match of the entire value, which is Matcher.matches()
        matcher.reset();
        return matcher.matches();
    }

    /**
     * @return <code>true</code> if the pattern has found something on the
     *         element that matches it
     */
    public boolean find() {
        matcher.reset();
        return matcher.find();
    }

    /**
     * @param n
     *            the group number
     * @return <code>true</code> if the pattern declares a group with the
     *         given number (negative numbers never exist)
     */
    public boolean hasGroup(int n) {
        return n >= 0 && n <= matcher.groupCount();
    }

    /**
     * @return the entire matched value as a string
     */
    public String asString() {
        return asString(0);
    }

    /**
     * @param n
     *            the group number
     * @return the group value as a string
     */
    public String asString(int n) {
        return matcher.group(n);
    }

    /**
     * @return the entire matched value as a integer
     */
    public int asInteger() {
        return asInteger(0);
    }

    /**
     * @param n
     *            the group number
     * @return the group value as a integer
     */
    public int asInteger(int n) {
        return Integer.parseInt(asString(n));
    }

    /**
     * @return the entire matched value as a long
     */
    public long asLong() {
        return asLong(0);
    }

    /**
     * @param n
     *            the group number
     * @return the group value as a long
     */
    public long asLong(int n) {
        return Long.parseLong(asString(n));
    }

    /**
     * @return the entire matched value as a double
     */
    public double asDouble() {
        return asDouble(0);
    }

    /**
     * @param n
     *            the group number
     * @return the group value as a double
     */
    public double asDouble(int n) {
        return Double.parseDouble(asString(n));
    }

    /**
     * @return the pattern matched against the element
     */
    public Pattern getPattern() {
        return matcher.pattern();
    }

    @Override
    public String toString() {
        return "MatchedElement [tag=" + tag + ", pattern=" + getPattern() + "]";
    }
}

View File

@@ -0,0 +1,841 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.html;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.tags.FormTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.InputTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.tags.TextareaTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import com.rogiel.httpchannel.util.html.PageElement.TagMatcher;
import com.rogiel.httpchannel.util.html.filter.TypeTagFilter;
import com.rogiel.httpchannel.util.html.matcher.IDTagMatcher;
import com.rogiel.httpchannel.util.html.matcher.NameTagMatcher;
/**
* This class handles all HTML parsing and searching. With this class it is
* easy to search for links matching a {@link Pattern}, as well as for images,
* frames, forms, inputs and many more HTML widgets.
*
* @author <a href="http://www.rogiel.com">Rogiel</a>
*/
public class Page {
/**
* The list of nodes on the HTML DOM model
*/
private final NodeList nodes;
/**
* This interface provides a mean to transform an list of objects into
* another type
*
* @author <a href="http://www.rogiel.com">Rogiel</a>
*
* @param <I>
* the input object type
* @param <O>
* the output object type
*/
private interface ListProcessor<I extends Tag, O> {
O process(I tag);
}
/**
* An default {@link ListProcessor} that converts all tags to an
* {@link PageElement}
*
* @author <a href="http://www.rogiel.com">Rogiel</a>
*
* @param <I>
* the input type
*/
private class DefaultListProcessor<I extends Tag> implements
ListProcessor<I, PageElement<I>> {
@Override
public PageElement<I> process(I tag) {
return new PageElement<I>(tag);
}
}
/**
* Creates a new page instance
*
* @param parser
* the HTML parser
* @throws ParserException
* an parsing exception
*/
public Page(Parser parser) throws ParserException {
this.nodes = parser.parse(null);
}
/*
* ************************************************************************
* ***** INTERNAL
* ************************************************************************
*/
/**
 * Collects every tag of this page accepted by the given filters and
 * converts each one with the processor.
 * 
 * @param processor
 *            the list processor
 * @param filters
 *            the filters to be applied; a single filter is used as is, any
 *            other amount is combined with an {@link AndFilter}
 * @return the list of converted tags, or an empty list if a
 *         {@link ParserException} is raised while iterating
 */
private <T extends Tag, O> List<O> filter(ListProcessor<T, O> processor,
        NodeFilter... filters) {
    final NodeFilter combined = (filters.length == 1) ? filters[0]
            : new AndFilter(filters);
    try {
        final NodeList accepted = nodes.extractAllNodesThatMatch(combined,
                true);
        return list(accepted, processor);
    } catch (ParserException e) {
        return Collections.emptyList();
    }
}
/**
 * Converts every node of a {@link NodeList} with the given processor.
 * 
 * @param list
 *            the input list
 * @param processor
 *            the processor that converts each node
 * @return a new list with the converted objects, in iteration order
 * @throws ParserException
 *             if the node iteration fails
 */
@SuppressWarnings("unchecked")
private <T extends Tag, O> List<O> list(final NodeList list,
        ListProcessor<T, O> processor) throws ParserException {
    final List<O> converted = new ArrayList<>();
    for (final NodeIterator it = list.elements(); it.hasMoreNodes();) {
        final T tag = (T) it.nextNode();
        converted.add(processor.process(tag));
    }
    return converted;
}
/**
 * Tries to search for a tag value that matches exactly (the entire string)
 * with the pattern.
 * <p>
 * Elements for which either matcher yields no content (<code>null</code>)
 * are skipped, so the returned list never contains <code>null</code>
 * entries.
 * 
 * @param list
 *            the list of elements
 * @param pattern
 *            the pattern
 * @param tagMatcher
 *            the tag matcher (which will be matched against the pattern)
 * @param realMatcher
 *            the real matcher (which will be returned on the
 *            {@link MatchedElement})
 * @return an list of {@link MatchedElement}
 */
private <T extends Tag, E extends PageElement<T>> List<MatchedElement<T>> match(
        List<E> list, Pattern pattern, TagMatcher<T> tagMatcher,
        TagMatcher<T> realMatcher) {
    final List<MatchedElement<T>> matchList = new ArrayList<>();
    for (final E tag : list) {
        final MatchedElement<T> matched = tag.match(pattern, tagMatcher);
        if (matched == null || !matched.matches())
            continue;
        if (tagMatcher == realMatcher) {
            matchList.add(matched);
            continue;
        }
        // the element is reported through the "real" content; that content
        // may be absent, in which case there is nothing to report
        final MatchedElement<T> real = tag.match(realMatcher);
        if (real != null)
            matchList.add(real);
    }
    return matchList;
}
/**
* Tries to search for a tag value that matches exactly (the entire string)
* with the pattern.
*
* @param list
* the list of elements
* @param pattern
* the pattern
* @param tagMatcher
* the tag matcher (which will be matched against the pattern and
* used on {@link MatchedElement})
* @return an list of {@link MatchedElement}
*/
private <T extends Tag, E extends PageElement<T>> List<MatchedElement<T>> match(
List<E> list, Pattern pattern, TagMatcher<T> tagMatcher) {
return match(list, pattern, tagMatcher, tagMatcher);
}
/**
 * Tries to search for a tag value that contains the content within the
 * pattern.
 * <p>
 * Elements for which the tag matcher yields no content (<code>null</code>)
 * are skipped.
 * 
 * @param list
 *            the list of elements
 * @param pattern
 *            the pattern
 * @param tagMatcher
 *            the tag matcher (which will be matched against the pattern and
 *            used on {@link MatchedElement})
 * @return an list of {@link MatchedElement}
 */
private <T extends Tag, E extends PageElement<T>> List<MatchedElement<T>> find(
        List<E> list, Pattern pattern, TagMatcher<T> tagMatcher) {
    final List<MatchedElement<T>> matchList = new ArrayList<>();
    for (final E tag : list) {
        final MatchedElement<T> matched = tag.match(pattern, tagMatcher);
        // PageElement.match returns null when the tag has no content
        if (matched == null)
            continue;
        if (matched.find())
            matchList.add(matched);
    }
    return matchList;
}
/**
 * Picks the first element of a list.
 * 
 * @param list
 *            the list
 * @return the first element of the list, or <code>null</code> if the list
 *         is empty
 */
private <O> O single(List<O> list) {
    return list.isEmpty() ? null : list.get(0);
}
/**
 * Parses the HTML page to a plain string. This is similar to the
 * "SEO preview" systems
 * 
 * @return the text of the page on a single line: <code>&amp;nbsp;</code>
 *         entities are removed, tabs become spaces, every line is trimmed,
 *         blank lines are dropped and each remaining line is followed by a
 *         single space
 */
public String asPlainString() {
    final String text = nodes.asString().replace("&nbsp;", "");
    final StringBuilder builder = new StringBuilder();
    for (final String line : text.split("\n")) {
        // normalise once and reuse the result for both the test and append
        final String procLine = line.replace('\t', ' ').trim();
        if (procLine.length() == 0)
            continue;
        builder.append(procLine).append(" ");
    }
    return builder.toString();
}
/*
* ************************************************************************
* ***** TEXT SEARCH
* ************************************************************************
*/
/**
* Searches for the given pattern at the entire page
*
* @param pattern
* the pattern
* @return the search results
*/
public SearchResults search(Pattern pattern) {
return new SearchResults(pattern, asPlainString());
}
/**
* Searches for the given text at the entire page
*
* @param text
* the text
* @return the search results
*/
public SearchResults searchFirst(String text) {
return search(Pattern.compile(Pattern.quote(text)));
}
/*
* ************************************************************************
* ***** LINKS
* ************************************************************************
*/
/**
* An {@link TagMatcher} that returns the link href
*/
private static final TagMatcher<LinkTag> LINK_TAG_MATCHER = new TagMatcher<LinkTag>() {
@Override
public String content(LinkTag tag) {
return tag.getLink();
}
};
/**
* @return a list of all links contained at the page
*/
public List<PageElement<LinkTag>> links() {
return filter(new DefaultListProcessor<LinkTag>(), new TypeTagFilter(
LinkTag.class));
}
/**
* Return all links whose URL matches the given pattern
*
* @param pattern
* the pattern
* @return the list of links matching the pattern
*/
public List<MatchedElement<LinkTag>> links(Pattern pattern) {
return match(links(), pattern, LINK_TAG_MATCHER);
}
/**
* Return the first link whose URL matches the given pattern
*
* @param pattern
* the pattern
* @return the first link matching the pattern
*/
public MatchedElement<LinkTag> link(Pattern pattern) {
return single(links(pattern));
}
/**
* @param pattern
* the pattern
* @return the links whose IDs matches the pattern
*/
public List<MatchedElement<LinkTag>> linksByID(Pattern pattern) {
return match(links(), pattern, new IDTagMatcher<LinkTag>(),
LINK_TAG_MATCHER);
}
/**
* @param id
* the link ID
* @return the link with the given ID
*/
public MatchedElement<LinkTag> linkByID(String id) {
return single(linksByID(Pattern.compile(Pattern.quote(id))));
}
/**
* @param pattern
* the name pattern
* @return the links whose name matches the pattern
*/
public List<MatchedElement<LinkTag>> linksByName(Pattern pattern) {
return match(links(), pattern, new NameTagMatcher<LinkTag>(),
LINK_TAG_MATCHER);
}
/**
* @param name
* the name
* @return the link with the given name
*/
public MatchedElement<LinkTag> linkByName(String name) {
return single(linksByName(Pattern.compile(Pattern.quote(name))));
}
/*
* ************************************************************************
* ***** IMAGES
* ************************************************************************
*/
/**
* An {@link TagMatcher} that returns the image source url
*/
private static final TagMatcher<ImageTag> IMAGE_TAG_MATCHER = new TagMatcher<ImageTag>() {
@Override
public String content(ImageTag tag) {
return tag.getImageURL();
}
};
/**
* @return the list of all images at the page
*/
public List<PageElement<ImageTag>> images() {
return filter(new DefaultListProcessor<ImageTag>(), new TypeTagFilter(
ImageTag.class));
}
/**
* @param pattern
* the image url pattern
* @return the list of images matching the url pattern
*/
public List<MatchedElement<ImageTag>> images(Pattern pattern) {
return match(images(), pattern, IMAGE_TAG_MATCHER);
}
/**
* @param pattern
* the image url pattern
* @return the first image whose url matches the pattern
*/
public MatchedElement<ImageTag> image(Pattern pattern) {
return single(images(pattern));
}
/**
* @param pattern
* the pattern id
* @return the list of images that match the given id
*/
public List<MatchedElement<ImageTag>> imagesByID(Pattern pattern) {
return match(images(), pattern, new IDTagMatcher<ImageTag>(),
IMAGE_TAG_MATCHER);
}
/**
* @param id
* the image ID
* @return the image that matches with the given id
*/
public MatchedElement<ImageTag> imageByID(String id) {
return single(imagesByID(Pattern.compile(Pattern.quote(id))));
}
/**
* @param pattern
* the image name pattern
* @return the list of images whose names match the pattern
*/
public List<MatchedElement<ImageTag>> imagesByName(Pattern pattern) {
return match(images(), pattern, new NameTagMatcher<ImageTag>(),
IMAGE_TAG_MATCHER);
}
/**
* @param name
* the image name
* @return the image whose name matches the given
*/
public MatchedElement<ImageTag> imageByName(String name) {
return single(imagesByName(Pattern.compile(Pattern.quote(name))));
}
/*
* ************************************************************************
* ***** FORM
* ************************************************************************
*/
/**
* An {@link TagMatcher} that returns the form action (or submit) url
*/
private static final TagMatcher<FormTag> FORM_TAG_MATCHER = new TagMatcher<FormTag>() {
@Override
public String content(FormTag tag) {
return tag.getFormLocation();
}
};
/**
* @return the list of all forms on the page
*/
public List<PageElement<FormTag>> forms() {
return filter(new DefaultListProcessor<FormTag>(), new TypeTagFilter(
FormTag.class));
}
/**
* @param pattern
* the action url pattern
* @return the forms whose urls matches the pattern
*/
public List<MatchedElement<FormTag>> forms(Pattern pattern) {
return match(forms(), pattern, FORM_TAG_MATCHER);
}
/**
* @param pattern
* the action url pattern
* @return the first form whose action url matches the pattern
*/
public MatchedElement<FormTag> form(Pattern pattern) {
return single(forms(pattern));
}
/**
* @param pattern
* the form id pattern
* @return the forms whose ids matches the pattern
*/
public List<MatchedElement<FormTag>> formsByID(Pattern pattern) {
return match(forms(), pattern, new IDTagMatcher<FormTag>(),
FORM_TAG_MATCHER);
}
/**
* @param id
* the form id
* @return the form whose id matches the given
*/
public MatchedElement<FormTag> formByID(String id) {
return single(formsByID(Pattern.compile(Pattern.quote(id))));
}
/**
* @param pattern
* the form name pattern
* @return the forms whose names matches the pattern
*/
public List<MatchedElement<FormTag>> formsByName(Pattern pattern) {
return match(forms(), pattern, new NameTagMatcher<FormTag>(),
FORM_TAG_MATCHER);
}
/**
* @param name
* the form name
* @return the form whose name matches the given
*/
public MatchedElement<FormTag> formByName(String name) {
return single(formsByName(Pattern.compile(Pattern.quote(name))));
}
/*
* ************************************************************************
* ***** INPUT
* ************************************************************************
*/
/**
* An {@link TagMatcher} that returns the input value
*/
private static final TagMatcher<InputTag> INPUT_TAG_MATCHER = new TagMatcher<InputTag>() {
@Override
public String content(InputTag tag) {
return tag.getAttribute("value");
}
};
/**
* @return the list of all inputs on the page
*/
public List<PageElement<InputTag>> inputs() {
return filter(new DefaultListProcessor<InputTag>(), new TypeTagFilter(
InputTag.class));
}
/**
* @param pattern
* the input value pattern
* @return the inputs whose values matches the pattern
*/
public List<MatchedElement<InputTag>> inputs(Pattern pattern) {
return find(inputs(), pattern, INPUT_TAG_MATCHER);
}
/**
* @param pattern
* the action url pattern
* @return the first input whose value matches the pattern
*/
public MatchedElement<InputTag> input(Pattern pattern) {
return single(inputs(pattern));
}
/**
* @param pattern
* the input id pattern
* @return the inputs whose ids matches the pattern
*/
public List<MatchedElement<InputTag>> inputsByID(Pattern pattern) {
return match(inputs(), pattern, new IDTagMatcher<InputTag>(),
INPUT_TAG_MATCHER);
}
/**
* @param id
* the input id
* @return the input whose id matches the given, or <code>null</code> if
* there is none
*/
public MatchedElement<InputTag> inputByID(String id) {
return single(inputsByID(Pattern.compile(Pattern.quote(id))));
}
/**
* @param pattern
* the input name pattern
* @return the inputs whose name matches the pattern
*/
public List<MatchedElement<InputTag>> inputsByName(Pattern pattern) {
return match(inputs(), pattern, new NameTagMatcher<InputTag>(),
INPUT_TAG_MATCHER);
}
/**
* @param name
* the input name
* @return the input whose name matches the given
*/
public MatchedElement<InputTag> inputByName(String name) {
return single(inputsByName(Pattern.compile(Pattern.quote(name))));
}
/*
* ************************************************************************
* ***** TEXTAREA
* ************************************************************************
*/
/**
* An {@link TagMatcher} that returns the textarea value
*/
private static final TagMatcher<TextareaTag> TEXTAREA_TAG_MATCHER = new TagMatcher<TextareaTag>() {
@Override
public String content(TextareaTag tag) {
return tag.getStringText();
}
};
/**
* @return the list of all textareas on the page
*/
public List<PageElement<TextareaTag>> textareas() {
return filter(new DefaultListProcessor<TextareaTag>(),
new TypeTagFilter(TextareaTag.class));
}
/**
* @param pattern
* the textarea value pattern
* @return the textareas whose values matches the pattern
*/
public List<MatchedElement<TextareaTag>> textareas(Pattern pattern) {
return match(textareas(), pattern, TEXTAREA_TAG_MATCHER);
}
/**
* @param pattern
* the textarea value pattern
* @return the first textarea whose value matches the pattern
*/
public MatchedElement<TextareaTag> textarea(Pattern pattern) {
return single(textareas(pattern));
}
/**
* @param pattern
* the textarea id pattern
* @return the textareas whose ids matches the pattern
*/
public List<MatchedElement<TextareaTag>> textareasByID(Pattern pattern) {
return match(textareas(), pattern, new IDTagMatcher<TextareaTag>(),
TEXTAREA_TAG_MATCHER);
}
/**
* @param id
* the textarea id
* @return the textarea whose id matches the given, or <code>null</code> if
* there is none
*/
public MatchedElement<TextareaTag> textareaByID(String id) {
return single(textareasByID(Pattern.compile(Pattern.quote(id))));
}
/**
* @param pattern
* the textarea name pattern
* @return the textareas whose name matches the pattern
*/
public List<MatchedElement<TextareaTag>> textareasByName(Pattern pattern) {
return match(textareas(), pattern, new NameTagMatcher<TextareaTag>(),
TEXTAREA_TAG_MATCHER);
}
/**
* @param name
* the textarea name
* @return the textarea whose name matches the given
*/
public MatchedElement<TextareaTag> textareaByName(String name) {
return single(textareasByName(Pattern.compile(Pattern.quote(name))));
}
/*
* ************************************************************************
* ***** JAVASCRIPT
* ************************************************************************
*/
/**
* @return the list of all scripts on the page
*/
public List<PageElement<ScriptTag>> scripts() {
return filter(new DefaultListProcessor<ScriptTag>(), new TypeTagFilter(
ScriptTag.class));
}
/**
* @param pattern
* the script code pattern
* @return the scripts whose code contains something matching the pattern
*/
public List<MatchedElement<ScriptTag>> scripts(Pattern pattern) {
return find(scripts(), pattern, new TagMatcher<ScriptTag>() {
@Override
public String content(ScriptTag tag) {
// the pattern is applied to the script code of the tag
return tag.getScriptCode();
}
});
}
/**
* @param pattern
* the script code pattern
* @return the first script whose code matches the pattern
*/
public MatchedElement<ScriptTag> script(Pattern pattern) {
return single(scripts(pattern));
}
/**
* @param pattern
* the script url pattern
* @return the first script whose <code>src</code> attribute matches the
* pattern, or <code>null</code> if there is none
*/
public MatchedElement<ScriptTag> scriptBySource(Pattern pattern) {
return single(match(scripts(), pattern, new TagMatcher<ScriptTag>() {
@Override
public String content(ScriptTag tag) {
// the pattern is applied to the "src" attribute of the tag
return tag.getAttribute("src");
}
}));
}
/*
* ************************************************************************
* ***** FRAME
* ************************************************************************
*/
/**
* An {@link TagMatcher} that returns the frame url
*/
private static final TagMatcher<FrameTag> FRAME_TAG_MATCHER = new TagMatcher<FrameTag>() {
@Override
public String content(FrameTag tag) {
return tag.getFrameLocation();
}
};
/**
* @return the list of all frames on the page
*/
public List<PageElement<FrameTag>> frames() {
return filter(new DefaultListProcessor<FrameTag>(), new TypeTagFilter(
FrameTag.class));
}
/**
* @param pattern
* the frame url pattern
* @return the frames whose urls matches the pattern
*/
public List<MatchedElement<FrameTag>> frames(Pattern pattern) {
return match(frames(), pattern, FRAME_TAG_MATCHER);
}
/**
* @param pattern
* the frame url pattern
* @return the first frame whose url matches the pattern
*/
public MatchedElement<FrameTag> frame(Pattern pattern) {
return single(frames(pattern));
}
/**
* @param pattern
* the frame id pattern
* @return the frames whose id matches the pattern
*/
public List<MatchedElement<FrameTag>> framesByID(Pattern pattern) {
return match(frames(), pattern, new IDTagMatcher<FrameTag>(),
FRAME_TAG_MATCHER);
}
/**
* @param id
* the frame id
* @return the frame whose id matches the given, or <code>null</code> if
* there is none
*/
public MatchedElement<FrameTag> frameByID(String id) {
return single(framesByID(Pattern.compile(Pattern.quote(id))));
}
/**
* @param pattern
* the frame name pattern
* @return the frames whose name matches the pattern
*/
public List<MatchedElement<FrameTag>> framesByName(Pattern pattern) {
return match(frames(), pattern, new NameTagMatcher<FrameTag>(),
FRAME_TAG_MATCHER);
}
/**
* @param name
* the frame name
* @return the frame whose name matches the given
*/
public MatchedElement<FrameTag> frameByName(String name) {
return single(framesByName(Pattern.compile(Pattern.quote(name))));
}
/*
* ************************************************************************
* ***** INITIALIZERS
* ************************************************************************
*/
/**
* Creates a new page parsing the HTML input
*
* @param html
* the html code
* @return the newly created {@link Page} object, or <code>null</code> if a
* {@link ParserException} is raised while parsing
*/
public static Page parse(String html) {
try {
return new Page(Parser.createParser(html, null));
} catch (ParserException e) {
// parsing failures are not propagated; the caller receives null
return null;
}
}
@Override
public String toString() {
return nodes.toHtml(false);
}
}

View File

@@ -0,0 +1,129 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.html;
import java.util.regex.Pattern;
import org.htmlparser.Tag;
/**
 * An element that represents a tag on the page
 * 
 * @author <a href="http://www.rogiel.com">Rogiel</a>
 */
public class PageElement<T extends Tag> {
    /**
     * The tag represented by this element
     */
    protected final T tag;

    /**
     * Creates a new instance
     * 
     * @param tag
     *            the tag
     */
    public PageElement(T tag) {
        this.tag = tag;
    }

    /**
     * Tries to match the element with a given pattern. The pattern is applied
     * to the HTML of the tag.
     * 
     * @param pattern
     *            the pattern
     * @return the matched element, or <code>null</code> if the tag yields no
     *         content
     */
    public MatchedElement<T> match(Pattern pattern) {
        return match(pattern, null);
    }

    /**
     * Tries to match the element with a given pattern using an alternative
     * {@link TagMatcher}
     * 
     * @param pattern
     *            the pattern
     * @param tagMatcher
     *            the tag matcher; if <code>null</code>, the HTML of the tag
     *            is used as content
     * @return the matched element, or <code>null</code> if the tag matcher
     *         yields no content
     */
    public MatchedElement<T> match(Pattern pattern, TagMatcher<T> tagMatcher) {
        final String content = contentOf(tagMatcher);
        if (content == null)
            return null;
        // reuse the value that was null-checked instead of asking the
        // matcher for it a second time
        return new MatchedElement<T>(tag, pattern, content);
    }

    /**
     * Tries to match the element with itself (return a {@link MatchedElement}
     * that always matched it self)
     * 
     * @param tagMatcher
     *            the tag matcher; if <code>null</code>, the HTML of the tag
     *            is used as content
     * @return an {@link MatchedElement} whose group 0 matches it self, or
     *         <code>null</code> if the tag matcher yields no content
     */
    public MatchedElement<T> match(TagMatcher<T> tagMatcher) {
        final String content = contentOf(tagMatcher);
        if (content == null)
            return null;
        return new MatchedElement<T>(tag, content);
    }

    /**
     * Resolves the content that patterns are tested against.
     * 
     * @param tagMatcher
     *            the tag matcher, or <code>null</code> to use the HTML of the
     *            tag
     * @return the content, possibly <code>null</code>
     */
    private String contentOf(TagMatcher<T> tagMatcher) {
        if (tagMatcher == null)
            return tag.toHtml();
        return tagMatcher.content(tag);
    }

    /**
     * A tag matcher is a helper that returns the value of a tag that a
     * pattern should be tested against.
     * 
     * @author <a href="http://www.rogiel.com">Rogiel</a>
     * 
     * @param <T>
     *            the tag type
     */
    public interface TagMatcher<T extends Tag> {
        /**
         * @param tag
         *            the tag
         * @return the value to be tested, or <code>null</code> if the tag has
         *         none
         */
        String content(T tag);
    }

    /**
     * @return the tag object
     */
    public T tag() {
        return tag;
    }

    @Override
    public String toString() {
        return "PageElement [tag=" + tag + "]";
    }
}

View File

@@ -0,0 +1,137 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.html;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Represents a search done against a page string.
 * <p>
 * This is a thin wrapper around a {@link Matcher} that exposes the current
 * match (or one of its groups) converted to common value types.
 *
 * @author <a href="http://www.rogiel.com">Rogiel</a>
 */
public class SearchResults {
    /**
     * The matcher
     */
    private final Matcher matcher;

    /**
     * Creates a new instance on top of an existing matcher. The matcher is
     * used as given: no match attempt is performed by this constructor.
     *
     * @param matcher
     *            the matcher
     */
    public SearchResults(Matcher matcher) {
        this.matcher = matcher;
    }

    /**
     * Creates a new instance and positions it on the first occurrence of
     * <code>pattern</code> inside <code>content</code>, if there is any.
     *
     * @param pattern
     *            the pattern
     * @param content
     *            the content
     */
    public SearchResults(Pattern pattern, String content) {
        this.matcher = pattern.matcher(content);
        // the outcome is deliberately ignored here: callers query it
        // through hasResults()
        this.matcher.find();
    }

    /**
     * Checks whether the pattern occurs in the content. As a side effect
     * the matcher is reset and left positioned on the first occurrence.
     *
     * @return <code>true</code> if the matcher has found any results
     */
    public boolean hasResults() {
        matcher.reset();
        return matcher.find();
    }

    /**
     * @param n
     *            the group number
     * @return <code>true</code> if the pattern declares a group with that
     *         number (group 0 always exists); negative numbers never do
     */
    public boolean hasGroup(int n) {
        // the lower bound matters: without it any negative number would be
        // reported as an existing group
        return n >= 0 && n <= matcher.groupCount();
    }

    /**
     * @return the entire matched value as a string
     * @throws IllegalStateException
     *             if there is no current match
     */
    public String asString() {
        return asString(0);
    }

    /**
     * @param n
     *            the group number
     * @return the group value as a string
     * @throws IllegalStateException
     *             if there is no current match
     * @throws IndexOutOfBoundsException
     *             if the pattern has no group with the given number
     */
    public String asString(int n) {
        return matcher.group(n);
    }

    /**
     * @return the entire matched value as an integer
     * @throws NumberFormatException
     *             if the matched value is not a valid integer
     */
    public int asInteger() {
        return asInteger(0);
    }

    /**
     * @param n
     *            the group number
     * @return the group value as an integer
     * @throws NumberFormatException
     *             if the group value is not a valid integer
     */
    public int asInteger(int n) {
        return Integer.parseInt(asString(n));
    }

    /**
     * @return the entire matched value as a long
     * @throws NumberFormatException
     *             if the matched value is not a valid long
     */
    public long asLong() {
        return asLong(0);
    }

    /**
     * @param n
     *            the group number
     * @return the group value as a long
     * @throws NumberFormatException
     *             if the group value is not a valid long
     */
    public long asLong(int n) {
        return Long.parseLong(asString(n));
    }

    /**
     * @return the entire matched value as a double
     * @throws NumberFormatException
     *             if the matched value is not a valid double
     */
    public double asDouble() {
        return asDouble(0);
    }

    /**
     * @param n
     *            the group number
     * @return the group value as a double
     * @throws NumberFormatException
     *             if the group value is not a valid double
     */
    public double asDouble(int n) {
        return Double.parseDouble(asString(n));
    }

    /**
     * @return the pattern used by the underlying matcher
     */
    public Pattern getPattern() {
        return matcher.pattern();
    }
}

View File

@@ -16,29 +16,36 @@
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.htmlparser;
package com.rogiel.httpchannel.util.html.filter;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Tag;
// NOTE(review): this section is a rendered diff hunk; lines of the IDFilter
// class and lines of the TypeTagFilter class are interleaved below, so it
// does not read as a single class.
public class IDFilter implements NodeFilter {
/**
* A filter that selects all tags matching a given type
*
* @author <a href="http://www.rogiel.com">Rogiel</a>
*/
public class TypeTagFilter implements NodeFilter {
private static final long serialVersionUID = 1L;
// (IDFilter) the value the "id" attribute is compared against
private final String id;
/**
* The tag type
*/
private final Class<? extends Tag> type;
public IDFilter(String id) {
this.id = id;
/**
* Creates a new instance
*
* @param type
* the tag type
*/
public TypeTagFilter(Class<? extends Tag> type) {
this.type = type;
}
@Override
public boolean accept(Node node) {
if (!(node instanceof Tag))
return false;
// (IDFilter) accept only tags that carry an "id" attribute equal to the
// configured id
final Tag tag = (Tag) node;
if (tag.getAttribute("id") == null)
return false;
if (!tag.getAttribute("id").equals(id))
return false;
return true;
// (TypeTagFilter) accept nodes whose class is the configured type or one
// of its subtypes
return type.isAssignableFrom(node.getClass());
}
}

View File

@@ -1,38 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.htmlparser;
import java.util.regex.Pattern;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
/**
 * A {@link NodeFilter} that accepts every node whose text contains at least
 * one occurrence of a pattern.
 */
public class ContainsFilter implements NodeFilter {
    private static final long serialVersionUID = 1L;

    /**
     * The pattern searched for inside the node text
     */
    private final Pattern content;

    /**
     * Creates a new instance
     *
     * @param content
     *            the pattern to look for
     */
    public ContainsFilter(Pattern content) {
        this.content = content;
    }

    @Override
    public boolean accept(Node node) {
        final String text = node.getText();
        return content.matcher(text).find();
    }
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.html.matcher;
import org.htmlparser.Tag;
import com.rogiel.httpchannel.util.html.PageElement.TagMatcher;
/**
 * A {@link TagMatcher} whose content is the value of the <code>id</code>
 * attribute of the tag.
 *
 * @author <a href="http://www.rogiel.com">Rogiel</a>
 */
public class IDTagMatcher<T extends Tag> implements TagMatcher<T> {
    /**
     * The name of the attribute whose value is returned
     */
    private static final String ATTRIBUTE = "id";

    @Override
    public String content(T tag) {
        return tag.getAttribute(ATTRIBUTE);
    }
}

View File

@@ -1,38 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.htmlparser;
import java.util.regex.Pattern;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
/**
 * A {@link NodeFilter} that lower-cases the text of each node and accepts
 * the node if the pattern occurs in the lower-cased text.
 */
public class ContainsInLowerCaseFilter implements NodeFilter {
    private static final long serialVersionUID = 1L;

    /**
     * The pattern searched for inside the lower-cased node text
     */
    private final Pattern content;

    /**
     * Creates a new instance
     *
     * @param content
     *            the pattern to look for; it is applied to lower-cased text
     */
    public ContainsInLowerCaseFilter(Pattern content) {
        this.content = content;
    }

    @Override
    public boolean accept(Node node) {
        final String lowered = node.getText().toLowerCase();
        return content.matcher(lowered).find();
    }
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.html.matcher;
import org.htmlparser.Tag;
import com.rogiel.httpchannel.util.html.PageElement.TagMatcher;
/**
 * A {@link TagMatcher} whose content is the value of the <code>name</code>
 * attribute of the tag.
 *
 * @author <a href="http://www.rogiel.com">Rogiel</a>
 */
public class NameTagMatcher<T extends Tag> implements TagMatcher<T> {
    /**
     * The name of the attribute whose value is returned
     */
    private static final String ATTRIBUTE = "name";

    @Override
    public String content(T tag) {
        return tag.getAttribute(ATTRIBUTE);
    }
}

View File

@@ -1,42 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.htmlparser;
import java.util.regex.Pattern;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.tags.FormTag;
/**
 * A {@link NodeFilter} that accepts {@link FormTag} nodes whose form
 * location matches a pattern entirely.
 */
public class FormActionPatternFilter implements NodeFilter {
    private static final long serialVersionUID = 1L;

    /**
     * The pattern the form location has to match
     */
    private final Pattern pattern;

    /**
     * Creates a new instance
     *
     * @param pattern
     *            the pattern the form location has to match
     */
    public FormActionPatternFilter(Pattern pattern) {
        this.pattern = pattern;
    }

    @Override
    public boolean accept(Node node) {
        if (node instanceof FormTag) {
            final String location = ((FormTag) node).getFormLocation();
            return pattern.matcher(location).matches();
        }
        return false;
    }
}

View File

@@ -1,44 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.htmlparser;
import java.util.regex.Pattern;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.nodes.TagNode;
/**
 * A {@link NodeFilter} that accepts {@link TagNode} nodes having a
 * <code>src</code> attribute that matches a pattern entirely.
 */
public class FramePatternFilter implements NodeFilter {
    private static final long serialVersionUID = 1L;

    /**
     * The pattern the <code>src</code> attribute has to match
     */
    private final Pattern pattern;

    /**
     * Creates a new instance
     *
     * @param pattern
     *            the pattern the <code>src</code> attribute has to match
     */
    public FramePatternFilter(Pattern pattern) {
        this.pattern = pattern;
    }

    @Override
    public boolean accept(Node node) {
        if (node instanceof TagNode) {
            final String source = ((TagNode) node).getAttribute("src");
            // tags without a source can never match
            return source != null && pattern.matcher(source).matches();
        }
        return false;
    }
}

View File

@@ -1,304 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.htmlparser;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.tags.FormTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.InputTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.tags.TextareaTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * A parsed HTML document that offers lookups over its nodes: plain-text and
 * per-node pattern searches, links, frames, images, forms, inputs, textareas
 * and scripts.
 *
 * @author <a href="http://www.rogiel.com">Rogiel</a>
 */
public class HTMLPage {
    /**
     * The nodes produced by parsing the document
     */
    private final NodeList nodes;

    /**
     * Parses the document eagerly and keeps the resulting nodes.
     *
     * @param parser
     *            the parser that reads the document
     * @throws ParserException
     *             if the parser fails
     */
    private HTMLPage(Parser parser) throws ParserException {
        this.nodes = parser.parse(null);
    }

    /**
     * Collects the nodes accepted by the given filters. With more than one
     * filter they are combined through an {@link AndFilter}.
     *
     * @param nodeType
     *            the element type expected by the caller; it only drives
     *            type inference and is not checked at runtime
     * @param filters
     *            the filters a node has to satisfy
     * @return the matching nodes, or an empty list if the extraction fails
     */
    private <T extends Node> List<T> filter(final Class<T> nodeType,
            NodeFilter... filters) {
        final NodeFilter filter;
        if (filters.length == 1)
            filter = filters[0];
        else
            filter = new AndFilter(filters);
        try {
            return list(nodes.extractAllNodesThatMatch(filter, true));
        } catch (ParserException e) {
            return Collections.emptyList();
        }
    }

    /**
     * Copies a {@link NodeList} into a typed {@link List}.
     *
     * @param list
     *            the node list
     * @return a new list with the same nodes, in iteration order
     * @throws ParserException
     *             if iterating the node list fails
     */
    @SuppressWarnings("unchecked")
    private <T extends Node> List<T> list(final NodeList list)
            throws ParserException {
        final List<T> filtered = new ArrayList<>();
        final NodeIterator iterator = list.elements();
        while (iterator.hasMoreNodes()) {
            // unchecked: callers pair the requested type with a filter that
            // only accepts nodes of that type
            filtered.add((T) iterator.nextNode());
        }
        return filtered;
    }

    /**
     * @param pattern
     *            the pattern
     * @return <code>true</code> if the pattern occurs in the plain text of
     *         the page (see {@link #asString()})
     */
    public boolean containsPlain(Pattern pattern) {
        return pattern.matcher(asString()).find();
    }

    /**
     * @param pattern
     *            the pattern
     * @return <code>true</code> if the text of any single node contains the
     *         pattern
     */
    public boolean contains(final Pattern pattern) {
        return !filter(Node.class, new ContainsFilter(pattern)).isEmpty();
    }

    /**
     * @param text
     *            the literal text
     * @return <code>true</code> if the text of any single node contains the
     *         given literal text
     */
    public boolean contains(final String text) {
        return contains(Pattern.compile(Pattern.quote(text)));
    }

    /**
     * Same as {@link #contains(String)}, but both the node text and the
     * searched text are lower-cased before the comparison.
     *
     * @param text
     *            the literal text
     * @return <code>true</code> if any node contains the text, ignoring case
     */
    public boolean containsIgnoreCase(final String text) {
        return !filter(
                Node.class,
                new ContainsInLowerCaseFilter(Pattern.compile(Pattern
                        .quote(text.toLowerCase())))).isEmpty();
    }

    /**
     * Searches the plain text of the page (see {@link #asString()}).
     *
     * @param pattern
     *            the pattern
     * @param n
     *            the group number
     * @return group <code>n</code> of the first occurrence, or
     *         <code>null</code> if the pattern does not occur
     */
    public String findPlain(final Pattern pattern, int n) {
        final Matcher matcher = pattern.matcher(asString());
        if (matcher.find())
            return matcher.group(n);
        return null;
    }

    /**
     * @param pattern
     *            the pattern
     * @param n
     *            the group number
     * @return the result of {@link #findPlain(Pattern, int)} as an integer
     * @throws NumberFormatException
     *             if nothing was found or the value is not an integer
     */
    public int findIntPlain(final Pattern pattern, int n) {
        return Integer.parseInt(findPlain(pattern, n));
    }

    /**
     * @param pattern
     *            the pattern
     * @param n
     *            the group number
     * @return the result of {@link #findPlain(Pattern, int)} as a double
     * @throws NullPointerException
     *             if nothing was found
     * @throws NumberFormatException
     *             if the value is not a double
     */
    public double findDoublePlain(final Pattern pattern, int n) {
        return Double.parseDouble(findPlain(pattern, n));
    }

    /**
     * Searches the text of each node individually.
     *
     * @param pattern
     *            the pattern
     * @param n
     *            the group number
     * @return group <code>n</code> of the first occurrence, or
     *         <code>null</code> if no node contains the pattern
     */
    public String find(final Pattern pattern, int n) {
        for (final Node tag : filter(Tag.class, new ContainsFilter(pattern))) {
            final Matcher matcher = pattern.matcher(tag.getText());
            if (matcher.find())
                return matcher.group(n);
        }
        return null;
    }

    /**
     * @param pattern
     *            the pattern
     * @param n
     *            the group number
     * @return the result of {@link #find(Pattern, int)} as an integer, or
     *         <code>0</code> if nothing was found
     * @throws NumberFormatException
     *             if the value found is not an integer
     */
    public int findAsInt(final Pattern pattern, int n) {
        String found = find(pattern, n);
        if (found == null)
            return 0;
        // parse the value that was actually found; looking it up again among
        // the scripts could yield a different value or none at all
        return Integer.parseInt(found);
    }

    /**
     * Tries to find a link that has an URI following the given pattern
     *
     * @param pattern
     *            the pattern
     * @return the link content, if found. <code>null</code> otherwise
     */
    public String findLink(final Pattern pattern) {
        for (final LinkTag tag : filter(LinkTag.class, new LinkPatternFilter(
                pattern))) {
            return tag.getLink();
        }
        return null;
    }

    /**
     * Tries to find a frame that has an URI following the given pattern
     *
     * @param pattern
     *            the pattern
     * @return the iframe uri, if found. <code>null</code> otherwise
     */
    public String findFrame(final Pattern pattern) {
        for (final TagNode tag : filter(TagNode.class, new FramePatternFilter(
                pattern))) {
            return tag.getAttribute("src");
        }
        return null;
    }

    /**
     * Tries to find an image that has an URI following the given pattern
     *
     * @param pattern
     *            the pattern
     * @return the image uri, if found. <code>null</code> otherwise
     */
    public String findImage(final Pattern pattern) {
        for (final ImageTag tag : filter(ImageTag.class,
                new ImagePatternFilter(pattern))) {
            return tag.getImageURL();
        }
        return null;
    }

    /**
     * Tries to find a form which has a location that respects the given
     * pattern
     *
     * @param pattern
     *            the pattern
     * @return the URI found, if any. <code>null</code> otherwise
     */
    public String findFormAction(final Pattern pattern) {
        for (final FormTag tag : filter(FormTag.class,
                new FormActionPatternFilter(pattern))) {
            return tag.getFormLocation();
        }
        return null;
    }

    /**
     * @param tags
     *            the candidate inputs
     * @return the <code>value</code> attribute of the first input, or
     *         <code>null</code> if the list is empty
     */
    private String inputValue(List<InputTag> tags) {
        for (final InputTag tag : tags) {
            return tag.getAttribute("value");
        }
        return null;
    }

    /**
     * @param inputName
     *            the input name
     * @return the value of the first input with the given name, or
     *         <code>null</code> if there is none
     */
    public String getInputValue(final String inputName) {
        return inputValue(filter(InputTag.class, new InputNameFilter(inputName)));
    }

    /**
     * @param inputName
     *            the input name
     * @return the value of the first input with the given name, as an
     *         integer
     * @throws NumberFormatException
     *             if there is no such value or it is not an integer
     */
    public int getInputValueAsInt(final String inputName) {
        return Integer.parseInt(getInputValue(inputName));
    }

    /**
     * @param id
     *            the input id
     * @return the value of the first input with the given id, or
     *         <code>null</code> if there is none
     */
    public String getInputValueById(final String id) {
        return inputValue(filter(InputTag.class, new InputIDFilter(id)));
    }

    /**
     * @param id
     *            the input id
     * @return the value of the first input with the given id, as an integer
     * @throws NumberFormatException
     *             if there is no such value or it is not an integer
     */
    public int getInputValueByIdInt(final String id) {
        return Integer.parseInt(inputValue(filter(InputTag.class,
                new InputIDFilter(id))));
    }

    /**
     * @param pattern
     *            the pattern the input value has to match
     * @return the value of the first input whose value matches the pattern,
     *         or <code>null</code> if there is none
     */
    public String getInputValue(final Pattern pattern) {
        return inputValue(filter(InputTag.class, new InputValuePatternFilter(
                pattern)));
    }

    /**
     * @param id
     *            the textarea id
     * @return the text of the textarea with the given id
     * @throws NullPointerException
     *             if no tag has the given id
     * @throws ClassCastException
     *             if the tag with that id is not a textarea
     */
    public String getTextareaValueById(String id) {
        return ((TextareaTag) getTagByID(id)).getStringText();
    }

    /**
     * @param name
     *            the textarea name
     * @return the text of the textarea with the given name
     * @throws NullPointerException
     *             if no tag has the given name
     * @throws ClassCastException
     *             if the tag with that name is not a textarea
     */
    public String getTextareaValueByName(String name) {
        return ((TextareaTag) getTagByName(name)).getStringText();
    }

    /**
     * @param id
     *            the tag id
     * @return the first tag with the given id, or <code>null</code> if there
     *         is none
     */
    public Tag getTagByID(final String id) {
        for (final Tag tag : filter(Tag.class, new IDFilter(id))) {
            return tag;
        }
        return null;
    }

    /**
     * @param name
     *            the tag name attribute
     * @return the first tag with the given name attribute, or
     *         <code>null</code> if there is none
     */
    public Tag getTagByName(final String name) {
        for (final Tag tag : filter(Tag.class, new NameFilter(name))) {
            return tag;
        }
        return null;
    }

    /**
     * Searches the code of each script individually.
     *
     * @param pattern
     *            the pattern
     * @param n
     *            the group number
     * @return group <code>n</code> of the first occurrence, or
     *         <code>null</code> if no script contains the pattern
     */
    public String findScript(final Pattern pattern, int n) {
        for (final ScriptTag tag : filter(ScriptTag.class,
                new ScriptContainsFilter(pattern))) {
            final Matcher matcher = pattern.matcher(tag.getScriptCode());
            if (matcher.find())
                return matcher.group(n);
        }
        return null;
    }

    /**
     * @param pattern
     *            the pattern the whole <code>src</code> attribute has to
     *            match
     * @return the <code>src</code> of the first script matching the pattern,
     *         or <code>null</code> if there is none
     */
    public String findScriptSrc(final Pattern pattern) {
        for (final ScriptTag tag : filter(ScriptTag.class, new ScriptSrcFilter(
                pattern))) {
            final Matcher matcher = pattern.matcher(tag.getAttribute("src"));
            if (matcher.matches())
                return matcher.group();
        }
        return null;
    }

    /**
     * @param pattern
     *            the pattern
     * @param n
     *            the group number
     * @return the result of {@link #findScript(Pattern, int)} as an integer,
     *         or <code>0</code> if nothing was found
     * @throws NumberFormatException
     *             if the value found is not an integer
     */
    public int findScriptAsInt(final Pattern pattern, int n) {
        String found = findScript(pattern, n);
        if (found == null)
            return 0;
        return Integer.parseInt(found);
    }

    /**
     * @return the HTML of the parsed nodes
     */
    @Override
    public String toString() {
        return nodes.toHtml(false);
    }

    /**
     * Parses an HTML document.
     *
     * @param html
     *            the HTML source
     * @return the parsed page, or <code>null</code> if parsing fails
     */
    public static HTMLPage parse(String html) {
        try {
            return new HTMLPage(Parser.createParser(html, null));
        } catch (ParserException e) {
            return null;
        }
    }

    /**
     * Builds the plain text of the page: the plain text of every top-level
     * node is split in lines, each line is trimmed, empty lines are dropped
     * and every remaining line is followed by a single space.
     *
     * @return the plain text of the page
     */
    public String asString() {
        final StringBuilder buff = new StringBuilder();
        for (int i = 0; i < nodes.size(); i++) {
            final String[] lines = nodes.elementAt(i).toPlainTextString()
                    .split("\n");
            for (final String line : lines) {
                final String processed = line.trim();
                if (processed.length() > 0) {
                    buff.append(processed).append(" ");
                }
            }
        }
        return buff.toString();
    }
}

View File

@@ -1,42 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.htmlparser;
import java.util.regex.Pattern;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.tags.ImageTag;
/**
 * A {@link NodeFilter} that accepts {@link ImageTag} nodes whose image URL
 * matches a pattern entirely.
 */
public class ImagePatternFilter implements NodeFilter {
    private static final long serialVersionUID = 1L;

    /**
     * The pattern the image URL has to match
     */
    private final Pattern pattern;

    /**
     * Creates a new instance
     *
     * @param pattern
     *            the pattern the image URL has to match
     */
    public ImagePatternFilter(Pattern pattern) {
        this.pattern = pattern;
    }

    @Override
    public boolean accept(Node node) {
        if (node instanceof ImageTag) {
            final String url = ((ImageTag) node).getImageURL();
            return pattern.matcher(url).matches();
        }
        return false;
    }
}

View File

@@ -1,44 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.htmlparser;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.tags.InputTag;
/**
 * A {@link NodeFilter} that accepts {@link InputTag} nodes whose
 * <code>id</code> attribute equals a given value.
 */
public class InputIDFilter implements NodeFilter {
    private static final long serialVersionUID = 1L;

    /**
     * The expected value of the <code>id</code> attribute
     */
    private final String id;

    /**
     * Creates a new instance
     *
     * @param id
     *            the expected value of the <code>id</code> attribute
     */
    public InputIDFilter(String id) {
        this.id = id;
    }

    @Override
    public boolean accept(Node node) {
        if (node instanceof InputTag) {
            final String actual = ((InputTag) node).getAttribute("id");
            // inputs without the attribute never match
            return actual != null && actual.equals(id);
        }
        return false;
    }
}

View File

@@ -1,44 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.htmlparser;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.tags.InputTag;
/**
 * A {@link NodeFilter} that accepts {@link InputTag} nodes whose
 * <code>name</code> attribute equals a given value.
 */
public class InputNameFilter implements NodeFilter {
    private static final long serialVersionUID = 1L;

    /**
     * The expected value of the <code>name</code> attribute
     */
    private final String name;

    /**
     * Creates a new instance
     *
     * @param name
     *            the expected value of the <code>name</code> attribute
     */
    public InputNameFilter(String name) {
        this.name = name;
    }

    @Override
    public boolean accept(Node node) {
        if (node instanceof InputTag) {
            final String actual = ((InputTag) node).getAttribute("name");
            // inputs without the attribute never match
            return actual != null && actual.equals(name);
        }
        return false;
    }
}

View File

@@ -1,46 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.htmlparser;
import java.util.regex.Pattern;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.tags.InputTag;
/**
 * A {@link NodeFilter} that accepts {@link InputTag} nodes whose
 * <code>value</code> attribute matches a pattern entirely.
 */
public class InputValuePatternFilter implements NodeFilter {
    private static final long serialVersionUID = 1L;

    /**
     * The pattern the <code>value</code> attribute has to match
     */
    private final Pattern pattern;

    /**
     * Creates a new instance
     *
     * @param pattern
     *            the pattern the <code>value</code> attribute has to match
     */
    public InputValuePatternFilter(Pattern pattern) {
        this.pattern = pattern;
    }

    @Override
    public boolean accept(Node node) {
        if (node instanceof InputTag) {
            final String value = ((InputTag) node).getAttribute("value");
            // inputs without a value never match
            return value != null && pattern.matcher(value).matches();
        }
        return false;
    }
}

View File

@@ -1,42 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.htmlparser;
import java.util.regex.Pattern;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.tags.LinkTag;
/**
 * A {@link NodeFilter} that accepts {@link LinkTag} nodes whose link matches
 * a pattern entirely.
 */
public class LinkPatternFilter implements NodeFilter {
    private static final long serialVersionUID = 1L;

    /**
     * The pattern the link has to match
     */
    private final Pattern pattern;

    /**
     * Creates a new instance
     *
     * @param pattern
     *            the pattern the link has to match
     */
    public LinkPatternFilter(Pattern pattern) {
        this.pattern = pattern;
    }

    @Override
    public boolean accept(Node node) {
        if (node instanceof LinkTag) {
            final String target = ((LinkTag) node).getLink();
            return pattern.matcher(target).matches();
        }
        return false;
    }
}

View File

@@ -1,44 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.htmlparser;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Tag;
/**
 * A {@link NodeFilter} that accepts any {@link Tag} whose <code>name</code>
 * attribute equals a given value.
 */
public class NameFilter implements NodeFilter {
    private static final long serialVersionUID = 1L;

    /**
     * The expected value of the <code>name</code> attribute
     */
    private final String name;

    /**
     * Creates a new instance
     *
     * @param name
     *            the expected value of the <code>name</code> attribute
     */
    public NameFilter(String name) {
        this.name = name;
    }

    @Override
    public boolean accept(Node node) {
        if (node instanceof Tag) {
            final String actual = ((Tag) node).getAttribute("name");
            // tags without the attribute never match
            return actual != null && actual.equals(name);
        }
        return false;
    }
}

View File

@@ -1,42 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.htmlparser;
import java.util.regex.Pattern;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.tags.ScriptTag;
/**
 * A {@link NodeFilter} that accepts {@link ScriptTag} nodes whose script
 * code contains at least one occurrence of a pattern.
 */
public class ScriptContainsFilter implements NodeFilter {
    private static final long serialVersionUID = 1L;

    /**
     * The pattern searched for inside the script code
     */
    private final Pattern pattern;

    /**
     * Creates a new instance
     *
     * @param pattern
     *            the pattern to look for
     */
    public ScriptContainsFilter(Pattern pattern) {
        this.pattern = pattern;
    }

    @Override
    public boolean accept(Node node) {
        if (node instanceof ScriptTag) {
            final String code = ((ScriptTag) node).getScriptCode();
            return pattern.matcher(code).find();
        }
        return false;
    }
}

View File

@@ -1,44 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.rogiel.httpchannel.util.htmlparser;
import java.util.regex.Pattern;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.tags.ScriptTag;
/**
 * A {@link NodeFilter} that accepts {@link ScriptTag} nodes having a
 * <code>src</code> attribute that matches a pattern entirely.
 */
public class ScriptSrcFilter implements NodeFilter {
    private static final long serialVersionUID = 1L;

    /**
     * The pattern the <code>src</code> attribute has to match
     */
    private final Pattern pattern;

    /**
     * Creates a new instance
     *
     * @param pattern
     *            the pattern the <code>src</code> attribute has to match
     */
    public ScriptSrcFilter(Pattern pattern) {
        this.pattern = pattern;
    }

    @Override
    public boolean accept(Node node) {
        if (node instanceof ScriptTag) {
            final String source = ((ScriptTag) node).getAttribute("src");
            // inline scripts have no source and never match
            return source != null && pattern.matcher(source).matches();
        }
        return false;
    }
}