View Javadoc

1   package net.sourceforge.jgeocoder.us;
2   import static net.sourceforge.jgeocoder.AddressComponent.LINE2;
3   import static net.sourceforge.jgeocoder.AddressComponent.PREDIR;
4   import static net.sourceforge.jgeocoder.AddressComponent.STATE;
5   import static net.sourceforge.jgeocoder.AddressComponent.STREET;
6   import static net.sourceforge.jgeocoder.AddressComponent.TYPE;
7   import static net.sourceforge.jgeocoder.AddressComponent.valueOf;
8   import static net.sourceforge.jgeocoder.us.AddressRegexLibrary.P_CORNER;
9   import static net.sourceforge.jgeocoder.us.AddressRegexLibrary.P_CSZ;
10  import static net.sourceforge.jgeocoder.us.AddressRegexLibrary.P_INTERSECTION;
11  import static net.sourceforge.jgeocoder.us.AddressRegexLibrary.P_STREET_ADDRESS;
12  
13  import java.util.EnumMap;
14  import java.util.HashSet;
15  import java.util.Map;
16  import java.util.Set;
17  import java.util.regex.Matcher;
18  import java.util.regex.Pattern;
19  
20  import net.sourceforge.jgeocoder.AddressComponent;
21  
22  import org.apache.commons.lang.StringUtils;
23  
24  //TODO: support these
25  //123 Avenue of art, philadelphia pa 12345
26  //PO box 123, abc city, ca 24656
27  //123 Route 29 South, new jersey, 12323  
28  /***
29   * TODO javadocs me
30   * @author jliang
31   *
32   */
33  public class AddressParser{
34  
35    private static final Pattern CORNER = Pattern.compile(P_CORNER.getRegex());
36    private static final Pattern STREET_ADDRESS = Pattern.compile(P_STREET_ADDRESS.getRegex());
37    private static final Pattern CSZ = Pattern.compile(P_CSZ.getRegex());
38    private static final Pattern INTERSECTION = Pattern.compile(P_INTERSECTION.getRegex());
39    private static final Pattern CLEANUP = Pattern.compile("[//s//p{Punct}&&[^//)//(#&,:;@'`-]]");
40    private static final Pattern STREET_TYPES = Pattern.compile(RegexLibrary.STREET_DESIGNATOR);
41    private static final Pattern STATES = Pattern.compile(RegexLibrary.US_STATES);
42    
43    private static String getCleanSttring(String rawAddr){
44      return CLEANUP.matcher(rawAddr).replaceAll(" ").replaceAll("//s+", " ");
45    }
46    /***
47     * Parses a raw address string 
48     * @param rawAddr
49     * @param autoCorrectStateSpelling swith on/off auto correction on state mis-spelling
50     * @return a map of parsed address components
51     */
52    public static Map<AddressComponent, String> parseAddress(String rawAddr, boolean autoCorrectStateSpelling){
53      rawAddr = getCleanSttring(rawAddr);
54      if(autoCorrectStateSpelling){
55        rawAddr = SpellingCorrector.correctStateSpelling(rawAddr);
56      }
57      Matcher m = STREET_ADDRESS.matcher(rawAddr);
58      Map<AddressComponent, String> ret = null;
59      if(m.matches()){
60        ret = getAddrMap(m, P_STREET_ADDRESS.getNamedGroupMap());
61        postProcess(ret);
62        String splitRawAddr = null;
63        String line12sep = ret.get(AddressComponent.TLID);//HACK!
64        if(!line12sep.contains(",") 
65            && (splitRawAddr = designatorConfusingCitiesCorrection(ret, rawAddr))!=null){
66          m = STREET_ADDRESS.matcher(splitRawAddr);
67          if(m.matches()){
68            ret = getAddrMap(m, P_STREET_ADDRESS.getNamedGroupMap());
69            ret.remove(AddressComponent.TLID);//HACK!
70            return ret;
71          }
72        }
73        ret.remove(AddressComponent.TLID);//HACK!
74      }
75      m = CORNER.matcher(rawAddr);
76      if(ret == null && m.find()){
77        m = INTERSECTION.matcher(rawAddr);
78        if(m.matches()){
79          ret = getAddrMap(m, P_INTERSECTION.getNamedGroupMap());
80        }
81      }
82      
83      if(ret == null){
84        m = CSZ.matcher(rawAddr);
85        if(m.matches()){
86          ret = getAddrMap(m, P_CSZ.getNamedGroupMap());
87        }
88      }
89      return ret;
90    }
91    /***
92     * Parses a raw address string, this delegates to {@linkplain AddressParser#parseAddress(String, boolean)} with autoCorrectStateSpelling set to false
93     * @param rawAddr
94     * @return a map of parsed address components
95     */
96    public static Map<AddressComponent, String> parseAddress(String rawAddr){
97      return parseAddress(rawAddr, true);
98    }
99    
100   private static void postProcess(Map<AddressComponent, String> m){
101     //these are (temporary?) hacks...
102     if(m.get(TYPE) == null && m.get(STREET)!= null 
103             && STREET_TYPES.matcher(m.get(STREET).toUpperCase()).matches()){
104       m.put(TYPE, m.get(STREET));
105       m.put(STREET, m.get(PREDIR));
106       m.put(PREDIR, null);
107     }
108     if(m.get(STATE) == null && m.get(LINE2)!= null 
109             && STATES.matcher(m.get(LINE2).toUpperCase()).matches()){
110       m.put(STATE, m.get(LINE2));
111       m.put(LINE2, null);
112     }
113   }
114   
115   private static Map<AddressComponent, String> getAddrMap(Matcher m, Map<Integer, String> groupMap){
116     Map<AddressComponent, String> ret = new EnumMap<AddressComponent, String>(AddressComponent.class);
117     for(int i=1; i<= m.groupCount(); i++){
118       String name = groupMap.get(i);
119       AddressComponent comp = valueOf(name);
120       if(ret.get(comp) == null){
121         putIfNotNull(ret, comp, m.group(i));
122       }
123     }
124     return ret;
125   }
126   
127   private static void putIfNotNull(Map<AddressComponent, String> m , AddressComponent ac, String v){
128     if(v != null)
129       m.put(ac, v);
130   }
131   //TODO: document this craziness
132   private static Pattern STREET_DESIGNATOR_CHECK = Pattern.compile("//b(?i:(?:"+RegexLibrary.STREET_DESIGNATOR+"))//b");
133   private static String designatorConfusingCitiesCorrection(Map<AddressComponent, String> parsedLocation, String input){
134     String street = parsedLocation.get(AddressComponent.STREET);
135     String type = parsedLocation.get(AddressComponent.TYPE);
136     String line2 = parsedLocation.get(AddressComponent.LINE2);
137     if(street == null || type == null || line2 != null || street.split(" ").length < 2){ return null;}
138 	  Matcher m = STREET_DESIGNATOR_CHECK.matcher(street);
139 	  if(m.find()){
140 		  String parsedstate = parsedLocation.get(AddressComponent.STATE);
141 		  if(parsedstate == null){
142 			  String parsedcity = parsedLocation.get(AddressComponent.CITY);
143 			  if(parsedcity != null && parsedcity.length() == 2){
144 				  parsedstate = parsedcity;
145 			  }
146 		  }
147 		  String normalizedState = AddressStandardizer.normalizeState(StringUtils.upperCase(parsedstate));
148 		  String inputUpper =  input.toUpperCase();
149 		  String ret = null;
150 		  Set<String> stateSet = new HashSet<String>();
151 		  if(normalizedState != null){
152 			  stateSet.add(normalizedState);
153 		  }else{ //if no state in put, this needs to work much harder
154 			  stateSet.addAll(SpecialData.C_MAP.keySet());
155 		  }
156 		  int stateIdx = parsedstate == null ? input.length() : input.lastIndexOf(parsedstate);
157 		  for(String state : stateSet){
158 		      for(String s : SpecialData.C_MAP.get(state)){
159 			        int idx = -1;
160 			        if((idx =inputUpper.lastIndexOf(s))!=-1){ //and the input has one of the city names that can confuse the parser
161 			          //this almost guaranteed to break the parser, help the parser by putting a comma separator before the city
162 			        	if(idx+s.length() >= stateIdx -2){
163 			        		return input.substring(0, idx)+","+input.substring(idx);
164 			        	}
165 			        }
166 			      }
167 		  }
168 	      return ret;
169 	  }			
170   return null;
171     
172   }  
173 }