1 package net.sourceforge.jgeocoder.us;
2 import static net.sourceforge.jgeocoder.AddressComponent.LINE2;
3 import static net.sourceforge.jgeocoder.AddressComponent.PREDIR;
4 import static net.sourceforge.jgeocoder.AddressComponent.STATE;
5 import static net.sourceforge.jgeocoder.AddressComponent.STREET;
6 import static net.sourceforge.jgeocoder.AddressComponent.TYPE;
7 import static net.sourceforge.jgeocoder.AddressComponent.valueOf;
8 import static net.sourceforge.jgeocoder.us.AddressRegexLibrary.P_CORNER;
9 import static net.sourceforge.jgeocoder.us.AddressRegexLibrary.P_CSZ;
10 import static net.sourceforge.jgeocoder.us.AddressRegexLibrary.P_INTERSECTION;
11 import static net.sourceforge.jgeocoder.us.AddressRegexLibrary.P_STREET_ADDRESS;
12
13 import java.util.EnumMap;
14 import java.util.HashSet;
15 import java.util.Map;
16 import java.util.Set;
17 import java.util.regex.Matcher;
18 import java.util.regex.Pattern;
19
20 import net.sourceforge.jgeocoder.AddressComponent;
21
22 import org.apache.commons.lang.StringUtils;
23
24
25
26
27
28 /***
29 * TODO javadocs me
30 * @author jliang
31 *
32 */
33 public class AddressParser{
34
35 private static final Pattern CORNER = Pattern.compile(P_CORNER.getRegex());
36 private static final Pattern STREET_ADDRESS = Pattern.compile(P_STREET_ADDRESS.getRegex());
37 private static final Pattern CSZ = Pattern.compile(P_CSZ.getRegex());
38 private static final Pattern INTERSECTION = Pattern.compile(P_INTERSECTION.getRegex());
39 private static final Pattern CLEANUP = Pattern.compile("[//s//p{Punct}&&[^//)//(#&,:;@'`-]]");
40 private static final Pattern STREET_TYPES = Pattern.compile(RegexLibrary.STREET_DESIGNATOR);
41 private static final Pattern STATES = Pattern.compile(RegexLibrary.US_STATES);
42
43 private static String getCleanSttring(String rawAddr){
44 return CLEANUP.matcher(rawAddr).replaceAll(" ").replaceAll("//s+", " ");
45 }
46 /***
47 * Parses a raw address string
48 * @param rawAddr
49 * @param autoCorrectStateSpelling swith on/off auto correction on state mis-spelling
50 * @return a map of parsed address components
51 */
52 public static Map<AddressComponent, String> parseAddress(String rawAddr, boolean autoCorrectStateSpelling){
53 rawAddr = getCleanSttring(rawAddr);
54 if(autoCorrectStateSpelling){
55 rawAddr = SpellingCorrector.correctStateSpelling(rawAddr);
56 }
57 Matcher m = STREET_ADDRESS.matcher(rawAddr);
58 Map<AddressComponent, String> ret = null;
59 if(m.matches()){
60 ret = getAddrMap(m, P_STREET_ADDRESS.getNamedGroupMap());
61 postProcess(ret);
62 String splitRawAddr = null;
63 String line12sep = ret.get(AddressComponent.TLID);
64 if(!line12sep.contains(",")
65 && (splitRawAddr = designatorConfusingCitiesCorrection(ret, rawAddr))!=null){
66 m = STREET_ADDRESS.matcher(splitRawAddr);
67 if(m.matches()){
68 ret = getAddrMap(m, P_STREET_ADDRESS.getNamedGroupMap());
69 ret.remove(AddressComponent.TLID);
70 return ret;
71 }
72 }
73 ret.remove(AddressComponent.TLID);
74 }
75 m = CORNER.matcher(rawAddr);
76 if(ret == null && m.find()){
77 m = INTERSECTION.matcher(rawAddr);
78 if(m.matches()){
79 ret = getAddrMap(m, P_INTERSECTION.getNamedGroupMap());
80 }
81 }
82
83 if(ret == null){
84 m = CSZ.matcher(rawAddr);
85 if(m.matches()){
86 ret = getAddrMap(m, P_CSZ.getNamedGroupMap());
87 }
88 }
89 return ret;
90 }
91 /***
92 * Parses a raw address string, this delegates to {@linkplain AddressParser#parseAddress(String, boolean)} with autoCorrectStateSpelling set to false
93 * @param rawAddr
94 * @return a map of parsed address components
95 */
96 public static Map<AddressComponent, String> parseAddress(String rawAddr){
97 return parseAddress(rawAddr, true);
98 }
99
100 private static void postProcess(Map<AddressComponent, String> m){
101
102 if(m.get(TYPE) == null && m.get(STREET)!= null
103 && STREET_TYPES.matcher(m.get(STREET).toUpperCase()).matches()){
104 m.put(TYPE, m.get(STREET));
105 m.put(STREET, m.get(PREDIR));
106 m.put(PREDIR, null);
107 }
108 if(m.get(STATE) == null && m.get(LINE2)!= null
109 && STATES.matcher(m.get(LINE2).toUpperCase()).matches()){
110 m.put(STATE, m.get(LINE2));
111 m.put(LINE2, null);
112 }
113 }
114
115 private static Map<AddressComponent, String> getAddrMap(Matcher m, Map<Integer, String> groupMap){
116 Map<AddressComponent, String> ret = new EnumMap<AddressComponent, String>(AddressComponent.class);
117 for(int i=1; i<= m.groupCount(); i++){
118 String name = groupMap.get(i);
119 AddressComponent comp = valueOf(name);
120 if(ret.get(comp) == null){
121 putIfNotNull(ret, comp, m.group(i));
122 }
123 }
124 return ret;
125 }
126
127 private static void putIfNotNull(Map<AddressComponent, String> m , AddressComponent ac, String v){
128 if(v != null)
129 m.put(ac, v);
130 }
131
132 private static Pattern STREET_DESIGNATOR_CHECK = Pattern.compile("//b(?i:(?:"+RegexLibrary.STREET_DESIGNATOR+"))//b");
133 private static String designatorConfusingCitiesCorrection(Map<AddressComponent, String> parsedLocation, String input){
134 String street = parsedLocation.get(AddressComponent.STREET);
135 String type = parsedLocation.get(AddressComponent.TYPE);
136 String line2 = parsedLocation.get(AddressComponent.LINE2);
137 if(street == null || type == null || line2 != null || street.split(" ").length < 2){ return null;}
138 Matcher m = STREET_DESIGNATOR_CHECK.matcher(street);
139 if(m.find()){
140 String parsedstate = parsedLocation.get(AddressComponent.STATE);
141 if(parsedstate == null){
142 String parsedcity = parsedLocation.get(AddressComponent.CITY);
143 if(parsedcity != null && parsedcity.length() == 2){
144 parsedstate = parsedcity;
145 }
146 }
147 String normalizedState = AddressStandardizer.normalizeState(StringUtils.upperCase(parsedstate));
148 String inputUpper = input.toUpperCase();
149 String ret = null;
150 Set<String> stateSet = new HashSet<String>();
151 if(normalizedState != null){
152 stateSet.add(normalizedState);
153 }else{
154 stateSet.addAll(SpecialData.C_MAP.keySet());
155 }
156 int stateIdx = parsedstate == null ? input.length() : input.lastIndexOf(parsedstate);
157 for(String state : stateSet){
158 for(String s : SpecialData.C_MAP.get(state)){
159 int idx = -1;
160 if((idx =inputUpper.lastIndexOf(s))!=-1){
161
162 if(idx+s.length() >= stateIdx -2){
163 return input.substring(0, idx)+","+input.substring(idx);
164 }
165 }
166 }
167 }
168 return ret;
169 }
170 return null;
171
172 }
173 }